xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 27f49a208b01d2e827ab9db411a2d16003fe9262)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
/* Option-value tables used with PetscOptionsEnum(): the enum value strings come
   first (in 0-based integer order), followed by the enum type name, the common
   string prefix, and a null terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
/* Note: the SpMM table lists "COO_ALG4" after "CSR_ALG1" because the cuSPARSE
   integer values are 0,1,2,3,4,5,6 and entry order here must follow those values. */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
61 
/* Forward declarations of the MATSEQAIJCUSPARSE implementation routines defined later in this file. */

/* symbolic/numeric Cholesky and ICC factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

/* symbolic/numeric LU and ILU factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

/* triangular solves, options handling, and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* destruction of the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

/* host/device synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* COO assembly support */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
97 
/* Implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE: record the
   requested GPU storage format in the matrix' cuSPARSE context. Only the
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL operations are accepted (both store
   the same format, since a sequential matrix has a single data structure). */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
115 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

   Level: intermediate

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if one is registered; silently a no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
139 
/* Implementation of MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE: set the
   flag that makes MatSolve() run with the built-in CPU kernels instead of the GPU. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
148 
/*@
   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Level: intermediate

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if one is registered; silently a no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
172 
/* MATSEQAIJCUSPARSE override of MatSetOption(): intercepts
   MAT_FORM_EXPLICIT_TRANSPOSE so a cached GPU transpose can be invalidated,
   and forwards every other option to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
188 
189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
190 
/* Numeric LU factorization for MATSEQAIJCUSPARSE. The factorization itself is
   done by the built-in CPU kernel (after ensuring the host copy of A is
   current); this routine then selects the MatSolve variants and, unless a CPU
   solve was requested, copies the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscBool           natural_rows, natural_cols;

  PetscFunctionBegin;
  /* bring A down to the host, then factor with the CPU kernel */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant: natural (identity) orderings allow the faster path */
  PetscCall(ISIdentity(b->row, &natural_rows));
  PetscCall(ISIdentity(b->col, &natural_cols));
  if (!cusp->use_cpu_solve) {
    if (natural_rows && natural_cols) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
222 
/* Process runtime options for MATSEQAIJCUSPARSE matrices: GPU storage format,
   CPU-vs-GPU triangular solve, and (CUDA >= 11) cuSPARSE algorithm choices for
   SpMV, SpMM, and CSR-to-CSC conversion. Only applies to unfactored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used only for MatMult() */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* storage format for all operations; processed second, so if both options are given this one wins */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
258 
/* Build (or refresh) the GPU copy of the unit lower triangular factor L of an
   ILU/LU factorization held in Mat_SeqAIJ format, and run the cuSPARSE
   triangular-solve analysis on it.

   The factored SeqAIJ matrix stores L's strictly-lower part row by row with
   the unit diagonal implicit; cuSPARSE needs a complete CSR matrix, so the
   diagonal 1's are inserted explicitly here (row i gets ai[i+1]-ai[i]
   off-diagonal entries plus one diagonal entry). On the first call the full
   CSR structure (row offsets, column indices, values) is assembled in pinned
   host memory and uploaded; later calls only refresh the values array. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the CPU holds the newest (or only) copy of the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 of L is just the diagonal 1, so ai[1]-ai[0] entries of row 0 belong to U; hence the -ai[1] */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first call: build the whole CSR structure in pinned host memory */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0; /* row 0 holds only the unit diagonal */
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign() calls below upload the pinned host arrays to the device */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the work buffer the analysis/solve phases need */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values array for fast refreshes; the index arrays are no longer needed on the host */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; /* unit diagonal entry */
          offset += 1;
          v += nz;
        }
        /* upload the refreshed values; structure on the device is unchanged */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
389 
/* Build (or refresh) the GPU copy of the upper triangular factor U of an
   ILU/LU factorization held in Mat_SeqAIJ format, and run the cuSPARSE
   triangular-solve analysis on it.

   In the factored SeqAIJ layout, a->diag[] indexes the U part, with rows
   stored such that row i's U entries live between adiag[i+1]+1 and adiag[i]
   (so rows are traversed here from i = n-1 down to 0), and the stored diagonal
   value is the reciprocal-free pivot: the CSR built for cuSPARSE stores
   1/v[nz] on the diagonal so the solve needs only a non-unit-diagonal
   triangular solve. First call builds the full structure; later calls only
   refresh the values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the CPU holds the newest (or only) copy of the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        /* first call: build the whole CSR structure in pinned host memory */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first
           (that is the order U's rows are laid out via adiag) */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* stored pivot is inverted here */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        /* diagonal is stored explicitly (as 1/pivot), hence NON_UNIT */
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* the assign() calls below upload the pinned host arrays to the device */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the work buffer the analysis/solve phases need */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned values array for fast refreshes; the index arrays are no longer needed on the host */
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        /* structure already on the device: refresh values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
520 
/* Build both ILU triangular factors on the GPU, allocate the intermediate work
   vector, and cache the row/column permutation indices on the device (only
   when the corresponding ordering is not the identity) so MatSolve can apply
   them there. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  const PetscInt                n       = A->rmap->n;
  PetscBool                     rows_natural, cols_natural;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, applied before the lower triangular solve */
  PetscCall(ISIdentity(aij->row, &rows_natural));
  if (!rows_natural && !factors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(aij->row, &r));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(aij->row, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* column permutation, applied after the upper triangular solve */
  PetscCall(ISIdentity(aij->icol, &cols_natural));
  if (!cols_natural && !factors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(aij->icol, &c));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(aij->icol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
563 
/*
  Builds (or refreshes) the GPU triangular-factor data for an ICC factorization.

  The factor A stores only the upper triangle (SBAIJ-style layout, diagonal last in each
  row).  This routine stages CSR arrays for U and for L = U^T in pinned host memory and
  uploads them to the device:
    - first call (no factor structs yet): allocates both Mat_SeqAIJCUSPARSETriFactorStruct
      objects, builds the CSR structure AND values, creates cuSPARSE descriptors, and runs
      the triangular-solve analysis for each factor;
    - subsequent calls (structure unchanged): recomputes only the numerical values and
      re-assigns them into the existing device arrays.
  No-op when the matrix is empty or the device copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ (below).
     This relies on the ICC factor being laid out SBAIJ-compatibly — confirm the aliasing
     is intentional for this factor type. */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper (U) and lower (L = U^T) factor values */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: the SBAIJ row stores the diagonal
             at position nz (last), and the factor stores 1/d on the diagonal */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonals: U gets -v, L = U^T additionally scaled by 1/d */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the lower factor reuses the UPPER-stored pattern and is solved with
           CUSPARSE_OPERATION_TRANSPOSE below, so the fill mode stays UPPER here */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity (AiUp/AjUp) as the upper factor, values AALo */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* structure already on the GPU: recompute values only.
           Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
760 
/*
  Pushes the ICC triangular factors of A to the GPU and finishes the solve setup:
  allocates the work vector used by MatSolve, records the factor nnz, and — when the
  factorization ordering is not the identity — uploads the row/column permutation
  index arrays so the solve kernels can permute the right-hand side and solution.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  const PetscInt                n       = A->rmap->n;
  PetscBool                     isIdentity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  /* both triangles of the symmetric factor, counting the diagonal once */
  factors->nnz = (aij->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload permutation indices only when the ordering actually permutes */
  PetscCall(ISIdentity(rowperm, &isIdentity));
  if (!isIdentity) {
    IS              invperm;
    const PetscInt *inv, *fwd;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &inv));
    PetscCall(ISGetIndices(rowperm, &fwd));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(fwd, fwd + n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(inv, inv + n);
    PetscCall(ISRestoreIndices(invperm, &inv));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm, &fwd));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
797 
/*
  Numeric Cholesky factorization for SeqAIJCUSPARSE matrices.

  The numeric factorization itself runs on the host (MatCholeskyFactorNumeric_SeqAIJ);
  this wrapper ensures the host copy of A is current first, then installs the solve
  callbacks (the natural-ordering variants skip the permutation step) and copies the
  resulting triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *bij = (Mat_SeqAIJ *)B->data;
  PetscBool   natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  PetscCall(ISIdentity(bij->row, &natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
826 
/*
  Builds the transposed triangular factors needed for MatSolveTranspose.

  For each of the lower and upper factors already on the GPU, this computes the explicit
  CSC (i.e. the transpose) with cusparse_csr2csc, creates a matching cuSPARSE descriptor
  (with the fill mode flipped, since transposing swaps upper/lower), and runs the
  triangular-solve analysis on the transposed factor.

  Fixes: the two MAT_CUSPARSEGenerateTranspose events were begun twice and never ended
  (PetscLogEventBegin where PetscLogEventEnd was required), which unbalances PETSc's
  event-nesting and corrupts -log_view timings for this event.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the transpose flips the fill mode (upper <-> lower) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transpose is stored explicitly, so solve non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
990 
/* Unary functor for thrust transforms: converts a PetscScalar to a PetscInt by
   truncating its real part (imaginary part, if any, is discarded). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar v) { return static_cast<PetscInt>(PetscRealPart(v)); }
};
994 
995 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
996 {
997   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
998   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
999   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1000   cusparseStatus_t              stat;
1001   cusparseIndexBase_t           indexBase;
1002 
1003   PetscFunctionBegin;
1004   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1005   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1006   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1007   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1008   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1009   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1010   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1011   PetscCall(PetscLogGpuTimeBegin());
1012   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1013   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1014     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1015     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1016     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1017     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1018     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1019 
1020     /* set alpha and beta */
1021     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1022     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1023     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1024     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1025     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1026     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1027 
1028     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1029       CsrMatrix *matrixT      = new CsrMatrix;
1030       matstructT->mat         = matrixT;
1031       matrixT->num_rows       = A->cmap->n;
1032       matrixT->num_cols       = A->rmap->n;
1033       matrixT->num_entries    = a->nz;
1034       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1035       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1036       matrixT->values         = new THRUSTARRAY(a->nz);
1037 
1038       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1039       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1040 
1041 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1042   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1043       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1044                                indexBase, cusparse_scalartype);
1045       PetscCallCUSPARSE(stat);
1046   #else
1047       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1048            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1049 
1050            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1051            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1052            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1053         */
1054       if (matrixT->num_entries) {
1055         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1056         PetscCallCUSPARSE(stat);
1057 
1058       } else {
1059         matstructT->matDescr = NULL;
1060         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1061       }
1062   #endif
1063 #endif
1064     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1065 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1066       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1067 #else
1068       CsrMatrix *temp  = new CsrMatrix;
1069       CsrMatrix *tempT = new CsrMatrix;
1070       /* First convert HYB to CSR */
1071       temp->num_rows       = A->rmap->n;
1072       temp->num_cols       = A->cmap->n;
1073       temp->num_entries    = a->nz;
1074       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1075       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1076       temp->values         = new THRUSTARRAY(a->nz);
1077 
1078       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1079       PetscCallCUSPARSE(stat);
1080 
1081       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1082       tempT->num_rows       = A->rmap->n;
1083       tempT->num_cols       = A->cmap->n;
1084       tempT->num_entries    = a->nz;
1085       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1086       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1087       tempT->values         = new THRUSTARRAY(a->nz);
1088 
1089       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1090                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1091       PetscCallCUSPARSE(stat);
1092 
1093       /* Last, convert CSC to HYB */
1094       cusparseHybMat_t hybMat;
1095       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1096       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1097       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1098       PetscCallCUSPARSE(stat);
1099 
1100       /* assign the pointer */
1101       matstructT->mat = hybMat;
1102       A->transupdated = PETSC_TRUE;
1103       /* delete temporaries */
1104       if (tempT) {
1105         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1106         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1107         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1108         delete (CsrMatrix *)tempT;
1109       }
1110       if (temp) {
1111         if (temp->values) delete (THRUSTARRAY *)temp->values;
1112         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1113         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1114         delete (CsrMatrix *)temp;
1115       }
1116 #endif
1117     }
1118   }
1119   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1120     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1121     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1122     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1123     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1124     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1125     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1126     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1127     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1128     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1129     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1130     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1131       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1132       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1133       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1134     }
1135     if (!cusparsestruct->csr2csc_i) {
1136       THRUSTARRAY csr2csc_a(matrix->num_entries);
1137       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1138 
1139       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1140 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1141       void  *csr2cscBuffer;
1142       size_t csr2cscBufferSize;
1143       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1144                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1145       PetscCallCUSPARSE(stat);
1146       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1147 #endif
1148 
1149       if (matrix->num_entries) {
1150         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1151            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1152            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1153 
1154            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1155            should be filled with indexBase. So I just take a shortcut here.
1156         */
1157         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1158 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1159                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1160         PetscCallCUSPARSE(stat);
1161 #else
1162                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1163         PetscCallCUSPARSE(stat);
1164 #endif
1165       } else {
1166         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1167       }
1168 
1169       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1170       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1172       PetscCallCUDA(cudaFree(csr2cscBuffer));
1173 #endif
1174     }
1175     PetscCallThrust(
1176       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1177   }
1178   PetscCall(PetscLogGpuTimeEnd());
1179   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1180   /* the compressed row indices is not used for matTranspose */
1181   matstructT->cprowIndices = NULL;
1182   /* assign the pointer */
1183   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1184   A->transupdated                                = PETSC_TRUE;
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
1188 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  PetscInt                           n = xx->map->n;

  PetscFunctionBegin;
  /* Transposed triangular factors are built lazily, on the first transpose solve */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  /* Raw device pointers for cusparse, thrust device pointers for the permutation copies */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(xarray);
  thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* Apply the row permutation to b, writing the permuted vector into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, fs->rpermIndices->end()), xGPU);

  /* Triangular solve with the transposed upper factor: work <- solve(upT, x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, xarray, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Triangular solve with the transposed lower factor: x <- solve(loT, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), xarray, loT->solvePolicy, loT->solveBuffer));

  /* Apply the column permutation; it cannot be done in place, so gather into the work vector first */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, fs->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, fs->cpermIndices->end()), work->begin());

  /* Copy the permuted solution back into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), work->begin(), work->end(), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1240 
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;

  PetscFunctionBegin;
  /* Transposed triangular factors are built lazily, on the first transpose solve */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* Natural ordering: no permutations needed; solve with the transposed upper factor first: work <- solve(upT, b) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, barray, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Then the transposed lower factor: x <- solve(loT, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), xarray, loT->solvePolicy, loT->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1278 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;

  PetscFunctionBegin;
  /* Raw device pointers for cusparse, thrust device pointers for the permutation copies */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(xarray);
  thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* Gather b through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), work->begin());

  /* Solve with the lower factor: x <- solve(lo, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, work->data().get(), xarray, lo->solvePolicy, lo->solveBuffer));

  /* Solve with the upper factor: work <- solve(up, x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, xarray, work->data().get(), up->solvePolicy, up->solveBuffer));

  /* Gather the work vector through the column permutation into the final solution x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work->begin(), fs->cpermIndices->begin()), thrust::make_permutation_iterator(work->begin(), fs->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1318 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* Natural ordering: no row/column permutations; solve with the lower factor first: work <- solve(lo, b) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, barray, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Then the upper factor: x <- solve(up, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), xarray, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1348 
1349 #if CUSPARSE_VERSION >= 11500
1350 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L y = b: bind b to the X descriptor and the internal work array to Y.
     Note cusparseSpSV_solve() secretly uses the external buffer that was handed to cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve U x = y: rebind the X descriptor to the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1382 
/* Transpose solve with the ILU0 factors: solves U^T y = b then L^T x = y using cusparseSpSV
   with op = CUSPARSE_OPERATION_TRANSPOSE on the same L/U matrix descriptors used by the forward solve */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  /* One-time setup: create SpSV descriptors and buffers for the transpose solves.
     This is deferred until the first MatSolveTranspose() call since many users never need it. */
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Redo the (numeric) analysis whenever the factor values have changed; the flag is
     reset by the numeric factorization (see MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0) */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b: bind b to the X descriptor and the internal work array fs->Y to Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y: rebind the X descriptor to the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1431 
/* Numeric ILU(0) factorization on the GPU: copies A's values into fact's preallocated CSR
   arrays, factorizes in place with cusparseXcsrilu02(), then runs the SpSV analyses so the
   factors are ready for MatSolve_SeqAIJCUSPARSE_ILU0(). The symbolic phase must have set up
   fs->csrRowPtr/csrColIdx/csrVal, the M/L/U descriptors, and the fact/spsv buffers. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact (device-to-device; ILU0 shares A's sparsity pattern, so nz values map 1:1) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  /* Zero-pivot check is debug-only; in optimized builds a zero pivot would go undetected here */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Install the solve callbacks; matsolve variants are not implemented for this path */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1483 
1484 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1485 {
1486   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1487   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1488   PetscInt                      m, nz;
1489 
1490   PetscFunctionBegin;
1491   if (PetscDefined(USE_DEBUG)) {
1492     PetscInt  i;
1493     PetscBool flg, missing;
1494 
1495     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1496     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1497     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1498     PetscCall(MatMissingDiagonal(A, &missing, &i));
1499     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1500   }
1501 
1502   /* Free the old stale stuff */
1503   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1504 
1505   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1506      but they will not be used. Allocate them just for easy debugging.
1507    */
1508   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1509 
1510   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1511   fact->factortype             = MAT_FACTOR_ILU;
1512   fact->info.factor_mallocs    = 0;
1513   fact->info.fill_ratio_given  = info->fill;
1514   fact->info.fill_ratio_needed = 1.0;
1515 
1516   aij->row = NULL;
1517   aij->col = NULL;
1518 
1519   /* ====================================================================== */
1520   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1521   /* We'll do in-place factorization on fact                                */
1522   /* ====================================================================== */
1523   const int *Ai, *Aj;
1524 
1525   m  = fact->rmap->n;
1526   nz = aij->nz;
1527 
1528   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1529   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1530   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1531   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1532   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1533   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1534 
1535   /* ====================================================================== */
1536   /* Create descriptors for M, L, U                                         */
1537   /* ====================================================================== */
1538   cusparseFillMode_t fillMode;
1539   cusparseDiagType_t diagType;
1540 
1541   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1542   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1543   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1544 
1545   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1546     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1547     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1548     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1549     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1550   */
1551   fillMode = CUSPARSE_FILL_MODE_LOWER;
1552   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1553   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1554   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1555   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1556 
1557   fillMode = CUSPARSE_FILL_MODE_UPPER;
1558   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1559   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1560   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1561   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1562 
1563   /* ========================================================================= */
1564   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1565   /* ========================================================================= */
1566   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1567   if (m)
1568     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1569                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1570 
1571   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1572   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1573 
1574   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1575   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1576 
1577   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1578   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1579 
1580   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1581   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1582 
1583   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1584      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1585      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1586      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1587    */
1588   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1589     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1590     fs->spsvBuffer_L = fs->factBuffer_M;
1591     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1592   } else {
1593     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1594     fs->spsvBuffer_U = fs->factBuffer_M;
1595     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1596   }
1597 
1598   /* ========================================================================== */
1599   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1600   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1601   /* ========================================================================== */
1602   int              structural_zero;
1603   cusparseStatus_t status;
1604 
1605   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1606   if (m)
1607     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1608                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1609   if (PetscDefined(USE_DEBUG)) {
1610     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1611     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1612     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1613   }
1614 
1615   /* Estimate FLOPs of the numeric factorization */
1616   {
1617     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1618     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1619     PetscLogDouble flops = 0.0;
1620 
1621     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1622     Ai    = Aseq->i;
1623     Adiag = Aseq->diag;
1624     for (PetscInt i = 0; i < m; i++) {
1625       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1626         nzRow  = Ai[i + 1] - Ai[i];
1627         nzLeft = Adiag[i] - Ai[i];
1628         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1629           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1630         */
1631         nzLeft = (nzRow - 1) / 2;
1632         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1633       }
1634     }
1635     fs->numericFactFlops = flops;
1636   }
1637   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1638   PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640 
/* Triangular solve with the ICC(0) factor computed by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0():
   forward solve L y = b, then backward solve L^T x = y, entirely on the GPU with cusparseSpSV.
   Note only the L descriptor exists; the transposed solve reuses it with CUSPARSE_OPERATION_TRANSPOSE. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b: rebind the X descriptor to b's device array and Y to the internal work vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y: rebind the X descriptor to the output array x */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves over nz stored nonzeros cost about 2*nz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1671 
/* Numeric ICC(0) factorization on device. The symbolic phase
   (MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0) already allocated the in-place factor
   storage, descriptors and buffers; here we refresh values from A, run csric02,
   and redo the SpSV analyses for the L and Lt solves. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact (device-to-device; pattern is unchanged since the symbolic phase) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsric02_zeroPivot() is blocking; it synchronizes before reporting the pivot */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Re-run the SpSV analysis for L since the factor values changed */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* Factor now lives on the GPU only; install the device solve callbacks */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1724 
/* Symbolic ICC(0) on device. Since ICC(0) keeps A's sparsity pattern, this phase
   copies A's row/column structure to the factor, creates the cuSPARSE descriptors,
   allocates/queries the csric02 and SpSV work buffers (sharing the biggest one to
   save memory), and runs the csric02 structural analysis. The numeric phase is
   then set to MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(). Only 32-bit indices are
   used on the device (CUSPARSE_INDEX_32I). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    /* ICC(0) requires a square MATSEQAIJCUSPARSE matrix with a full diagonal */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0): factor pattern equals A's pattern, so no extra fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* M uses the legacy (generic) descriptor required by csric02 */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares M's arrays: the lower triangle of the in-place factor is L */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); /* csric02 errors out on m=0 */

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization, used for GPU flop logging in the numeric phase */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* Eliminating each nonzero left of the diagonal updates it and the entries to its
           right, costing one multiplication and one addition per updated entry. We
           approximate the number of nonzeros left of the diagonal by half the row.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1866 #endif
1867 
/* Dispatch the symbolic ILU: take the all-on-device ILU(0) fast path when
   available and applicable, otherwise fall back to the host symbolic phase
   with device numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  /* ILU(0) with natural ordering can be factorized entirely on the device */
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (fs->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* General path: discard stale device factors, do the symbolic phase on the
     host, and route the numeric phase to the CUSPARSE implementation. */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1890 
/* Symbolic LU: always done on the host (no device fast path for full LU);
   the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop any stale device factor data before rebuilding the symbolic factor */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1901 
/* Dispatch the symbolic ICC: take the all-on-device ICC(0) fast path when
   available and applicable, otherwise fall back to the host symbolic phase
   with device numeric Cholesky. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  /* ICC(0) with natural ordering can be factorized entirely on the device */
  PetscBool perm_identity = PETSC_FALSE;
  if (fs->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* General path: discard stale device factors, do the symbolic phase on the
     host, and route the numeric phase to the CUSPARSE implementation. */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1921 
/* Symbolic Cholesky: always done on the host (no device fast path for full
   Cholesky); the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop any stale device factor data before rebuilding the symbolic factor */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1932 
/* MatFactorGetSolverType() callback: identifies this factorization as provided
   by the "cusparse" solver package (MATSOLVERCUSPARSE). The Mat argument is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1939 
1940 /*MC
1941   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1946   algorithms are not recommended. This class does NOT support direct solver operations.
1947 
1948   Level: beginner
1949 
1950 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1951           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1952 M*/
1953 
/* MatGetFactor() implementation for MATSEQAIJCUSPARSE: creates the (square)
   factor matrix B, reads -mat_factor_bind_factorization to decide whether the
   factorization runs on the host or the device, and installs the symbolic
   factorization callbacks matching the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool onDevice, onHost;
  char     *prefix;
  char      place[32] = "device"; /* the default factorization location */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Let the user override where the factorization is computed */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, place, sizeof(place), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", place, &onDevice));
  PetscCall(PetscStrcasecmp("host", place, &onHost));
  PetscCheck(onDevice || onHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", place);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = onDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    /* When A is bound to the CPU, fall back to the plain SeqAIJ symbolic routines */
    if (A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2006 
/* Copy the matrix values (not the sparsity pattern) from device to host when the
   device copy is the current one; afterwards the offload mask is OFFLOAD_BOTH.
   For unfactored matrices the values come from the CSR mult structure; for
   factored matrices (CUSPARSE >= 13500 device ILU0/ICC0) from the TriFactors
   in-place factor storage. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; /* spptr holds a TriFactors struct when A is factored */
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2036 
/* Read-write access to the host value array: sync from the device first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *a;

  PetscFunctionBegin;
  /* Make sure the host copy of the values is current before exposing it */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  a      = (Mat_SeqAIJ *)A->data;
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2044 
/* End read-write access: values may have changed on the host, so mark the
   device copy stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2052 
/* Read-only access to the host value array: sync from the device first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *a;

  PetscFunctionBegin;
  /* Bring values to the host (no-op if the host copy is already current) */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  a      = (Mat_SeqAIJ *)A->data;
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2060 
/* End read-only access: nothing was modified, so the offload state is untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2067 
/* Write-only access to the host value array: the current contents will be
   overwritten, so no device-to-host sync is needed */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *a;

  PetscFunctionBegin;
  a      = (Mat_SeqAIJ *)A->data;
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2074 
/* End write access: the host now holds the authoritative values, so mark the
   device copy stale */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2082 
/* Expose the device CSR arrays (row offsets, column indices, values) of an
   unfactored matrix, along with the memory type (CUDA). Any output pointer may
   be NULL when the caller does not need it. Device indices are 32-bit, so this
   is unsupported with 64-bit PetscInt. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  /* Make sure the device CSR copy exists and is current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  csr = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = csr->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = csr->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = csr->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2113 
2114 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2115 {
2116   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2117   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2118   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2119   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2120   cusparseStatus_t              stat;
2121   PetscBool                     both = PETSC_TRUE;
2122 
2123   PetscFunctionBegin;
2124   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2125   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2126     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2127       CsrMatrix *matrix;
2128       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2129 
2130       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2131       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2132       matrix->values->assign(a->a, a->a + a->nz);
2133       PetscCallCUDA(WaitForCUDA());
2134       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2135       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2136       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2137     } else {
2138       PetscInt nnz;
2139       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2140       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2141       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2142       delete cusparsestruct->workVector;
2143       delete cusparsestruct->rowoffsets_gpu;
2144       cusparsestruct->workVector     = NULL;
2145       cusparsestruct->rowoffsets_gpu = NULL;
2146       try {
2147         if (a->compressedrow.use) {
2148           m    = a->compressedrow.nrows;
2149           ii   = a->compressedrow.i;
2150           ridx = a->compressedrow.rindex;
2151         } else {
2152           m    = A->rmap->n;
2153           ii   = a->i;
2154           ridx = NULL;
2155         }
2156         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2157         if (!a->a) {
2158           nnz  = ii[m];
2159           both = PETSC_FALSE;
2160         } else nnz = a->nz;
2161         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2162 
2163         /* create cusparse matrix */
2164         cusparsestruct->nrows = m;
2165         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2166         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2167         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2168         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2169 
2170         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2171         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2172         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2173         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2174         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2175         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2176         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2177 
2178         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2179         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2180           /* set the matrix */
2181           CsrMatrix *mat   = new CsrMatrix;
2182           mat->num_rows    = m;
2183           mat->num_cols    = A->cmap->n;
2184           mat->num_entries = nnz;
2185           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2186           mat->row_offsets->assign(ii, ii + m + 1);
2187 
2188           mat->column_indices = new THRUSTINTARRAY32(nnz);
2189           mat->column_indices->assign(a->j, a->j + nnz);
2190 
2191           mat->values = new THRUSTARRAY(nnz);
2192           if (a->a) mat->values->assign(a->a, a->a + nnz);
2193 
2194           /* assign the pointer */
2195           matstruct->mat = mat;
2196 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2197           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2198             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2199                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2200             PetscCallCUSPARSE(stat);
2201           }
2202 #endif
2203         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2205           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2206 #else
2207           CsrMatrix *mat   = new CsrMatrix;
2208           mat->num_rows    = m;
2209           mat->num_cols    = A->cmap->n;
2210           mat->num_entries = nnz;
2211           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2212           mat->row_offsets->assign(ii, ii + m + 1);
2213 
2214           mat->column_indices = new THRUSTINTARRAY32(nnz);
2215           mat->column_indices->assign(a->j, a->j + nnz);
2216 
2217           mat->values = new THRUSTARRAY(nnz);
2218           if (a->a) mat->values->assign(a->a, a->a + nnz);
2219 
2220           cusparseHybMat_t hybMat;
2221           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2222           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2223           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2224           PetscCallCUSPARSE(stat);
2225           /* assign the pointer */
2226           matstruct->mat = hybMat;
2227 
2228           if (mat) {
2229             if (mat->values) delete (THRUSTARRAY *)mat->values;
2230             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2231             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2232             delete (CsrMatrix *)mat;
2233           }
2234 #endif
2235         }
2236 
2237         /* assign the compressed row indices */
2238         if (a->compressedrow.use) {
2239           cusparsestruct->workVector = new THRUSTARRAY(m);
2240           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2241           matstruct->cprowIndices->assign(ridx, ridx + m);
2242           tmp = m;
2243         } else {
2244           cusparsestruct->workVector = NULL;
2245           matstruct->cprowIndices    = NULL;
2246           tmp                        = 0;
2247         }
2248         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2249 
2250         /* assign the pointer */
2251         cusparsestruct->mat = matstruct;
2252       } catch (char *ex) {
2253         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2254       }
2255       PetscCallCUDA(WaitForCUDA());
2256       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2257       cusparsestruct->nonzerostate = A->nonzerostate;
2258     }
2259     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2260   }
2261   PetscFunctionReturn(PETSC_SUCCESS);
2262 }
2263 
/* Thrust functor for zipped iterators: accumulate element 0 of the tuple into
   element 1, i.e. t<1> += t<0>. Used to add a computed partial result into an
   existing vector. */
struct VecCUDAPlusEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
2271 
/* Thrust functor for zipped iterators: copy element 0 of the tuple into
   element 1, i.e. t<1> = t<0>. */
struct VecCUDAEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2279 
/* Thrust functor for zipped iterators: copy in the opposite direction of
   VecCUDAEquals, i.e. t<0> = t<1>. */
struct VecCUDAEqualsReverse {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2287 
/* Product data attached to C->product for sparse (CUSPARSE) matrix-matrix products.
   Allocated zeroed with PetscNew() in the symbolic phases and freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* user's C was MATSEQDENSE (CPU); numeric phase converts C back to it at the end */
  PetscScalar   *Bt;       /* device buffer holding explicit B^T for cusparseXcsrmm (CUDA < 11), which cannot transpose B */
  Mat            X;        /* intermediate dense matrix holding the sparse-times-dense part of RARt/PtAP */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count of the product, logged in the numeric phase */
  CsrMatrix     *Bcsr;     /* alternative CSR storage of B used when B is kept in compressed row form */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* generic-API descriptor for Bcsr above */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B (SpMM) */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C or X (SpMM) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* NOTE(review): extra SpGEMMreuse work buffers, presumably allocated in the symbolic phase — freed in MatDestroy_MatMatCusparse() */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer; SpMM reallocates when a larger buffer is requested */
  void                 *mmBuffer;     /* work buffer for cusparseSpMM / cusparseSpGEMM_compute */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor shared by the SpGEMM symbolic/numeric calls */
#endif
};
2312 
/* Destructor for the MatMatCusparse product data: releases every device buffer,
   cuSPARSE descriptor and intermediate matrix it owns, then frees the struct itself.
   Since the struct is allocated zeroed, unset members are NULL; cudaFree(NULL) and
   delete on a null pointer are harmless no-ops, and descriptor destruction is guarded
   explicitly. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  /* device-side scratch owned unconditionally */
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* generic-API descriptors: destroy only those that were created */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  /* intermediate dense matrix used by RARt/PtAP, then the struct itself */
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2336 
2337 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2338 
/* Numeric phase of a product of a MATSEQAIJCUSPARSE matrix A with a dense matrix B.

   Supports MATPRODUCT_AB, AtB, ABt, PtAP and RARt. For PtAP/RARt the sparse-times-dense
   part is first written into the intermediate dense matrix mmdata->X (created in the
   symbolic phase) and a dense-dense multiply then forms C. On CUDA >= 11 the generic
   cusparseSpMM API is used (it supports op(B) directly); on older CUDA, cusparseXcsrmm
   cannot transpose B, so B^T is formed explicitly with cublasXgeam into mmdata->Bt.
   Dense descriptors and the SpMM work buffer are cached in mmdata and rebuilt only when
   the leading dimensions of B or C change. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* pick the CSR struct (A or its explicit transpose), the cuSPARSE operation,
     and the m x n size of the sparse-times-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cuSPARSE transpose on the fly, or use the stored explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* for RARt/PtAP the sparse product goes into the intermediate X, otherwise directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    /* the sparse descriptor of A is created lazily and cached on the mult struct */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* only grow the work buffer, never shrink it */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form Bt = B^T explicitly with a cuBLAS out-of-place transpose (geam) */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    /* C = B * X, with X = A * B^T computed above */
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    /* C formed from B and X = A * B; the PETSC_TRUE flag presumably transposes B — see the Internal routine */
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo any temporary GPU conversions requested above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2496 
/* Symbolic phase of a product of a MATSEQAIJCUSPARSE matrix with a dense matrix.

   Sets the sizes and (CUDA) dense type of C, allocates the MatMatCusparse product data,
   and creates whatever intermediate storage the numeric phase needs: the explicit-B^T
   buffer Bt on CUDA < 11 (csrmm cannot transpose B) and the intermediate dense matrix X
   for RARt/PtAP. Installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric
   routine. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2567 
/* Numeric phase of a sparse-sparse product C = op(A) op(B) with all of A, B, C of type
   MATSEQAIJCUSPARSE.

   Fast paths: if the symbolic phase already computed the values (mmdata->reusesym), or if
   C has no nonzeros, skip straight to the assembly bookkeeping at 'finalize'. Otherwise
   the product is computed on the GPU: with CUDA >= 11.4 via cusparseSpGEMMreuse_compute,
   with CUDA 11.x via cusparseSpGEMM_compute + _copy, and before CUDA 11 via
   cusparse_csr_spgemm. AtB/ABt are mapped to AB using the stored explicit transposes,
   since cuSPARSE spgemm does not support transposed operands. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty C: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* map AtB/ABt back to AB when symmetry makes the transpose unnecessary; the symbolic
     phase must have made the same decision */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized explicitly since spgemm cannot transpose operands */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: reuse the structure computed in the symbolic phase, recompute values only */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* CUDA 11.0-11.3: recompute, then copy the result into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* legacy (CUDA < 11) csrgemm path */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2688 
2689 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2690 {
2691   Mat_Product                  *product = C->product;
2692   Mat                           A, B;
2693   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2694   Mat_SeqAIJ                   *a, *b, *c;
2695   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2696   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2697   PetscInt                      i, j, m, n, k;
2698   PetscBool                     flg;
2699   cusparseStatus_t              stat;
2700   MatProductType                ptype;
2701   MatMatCusparse               *mmdata;
2702   PetscLogDouble                flops;
2703   PetscBool                     biscompressed, ciscompressed;
2704 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2705   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2706   cusparseSpMatDescr_t BmatSpDescr;
2707 #else
2708   int cnz;
2709 #endif
2710   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2711 
2712   PetscFunctionBegin;
2713   MatCheckProduct(C, 1);
2714   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2715   A = product->A;
2716   B = product->B;
2717   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2718   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2719   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2720   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2721   a = (Mat_SeqAIJ *)A->data;
2722   b = (Mat_SeqAIJ *)B->data;
2723   /* product data */
2724   PetscCall(PetscNew(&mmdata));
2725   C->product->data    = mmdata;
2726   C->product->destroy = MatDestroy_MatMatCusparse;
2727 
2728   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2729   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2730   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2731   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2732   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2733   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2734 
2735   ptype = product->type;
2736   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2737     ptype                                          = MATPRODUCT_AB;
2738     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2739   }
2740   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2741     ptype                                          = MATPRODUCT_AB;
2742     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2743   }
2744   biscompressed = PETSC_FALSE;
2745   ciscompressed = PETSC_FALSE;
2746   switch (ptype) {
2747   case MATPRODUCT_AB:
2748     m    = A->rmap->n;
2749     n    = B->cmap->n;
2750     k    = A->cmap->n;
2751     Amat = Acusp->mat;
2752     Bmat = Bcusp->mat;
2753     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2754     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2755     break;
2756   case MATPRODUCT_AtB:
2757     m = A->cmap->n;
2758     n = B->cmap->n;
2759     k = A->rmap->n;
2760     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2761     Amat = Acusp->matTranspose;
2762     Bmat = Bcusp->mat;
2763     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2764     break;
2765   case MATPRODUCT_ABt:
2766     m = A->rmap->n;
2767     n = B->rmap->n;
2768     k = A->cmap->n;
2769     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2770     Amat = Acusp->mat;
2771     Bmat = Bcusp->matTranspose;
2772     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2773     break;
2774   default:
2775     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2776   }
2777 
2778   /* create cusparse matrix */
2779   PetscCall(MatSetSizes(C, m, n, m, n));
2780   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2781   c     = (Mat_SeqAIJ *)C->data;
2782   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2783   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2784   Ccsr  = new CsrMatrix;
2785 
2786   c->compressedrow.use = ciscompressed;
2787   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2788     c->compressedrow.nrows = a->compressedrow.nrows;
2789     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2790     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2791     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2792     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2793     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2794   } else {
2795     c->compressedrow.nrows  = 0;
2796     c->compressedrow.i      = NULL;
2797     c->compressedrow.rindex = NULL;
2798     Ccusp->workVector       = NULL;
2799     Cmat->cprowIndices      = NULL;
2800   }
2801   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2802   Ccusp->mat        = Cmat;
2803   Ccusp->mat->mat   = Ccsr;
2804   Ccsr->num_rows    = Ccusp->nrows;
2805   Ccsr->num_cols    = n;
2806   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2807   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2808   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2809   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2810   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
2811   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
2812   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2813   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2814   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2815   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2816   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2817     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2818     c->nz                = 0;
2819     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2820     Ccsr->values         = new THRUSTARRAY(c->nz);
2821     goto finalizesym;
2822   }
2823 
2824   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2825   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2826   Acsr = (CsrMatrix *)Amat->mat;
2827   if (!biscompressed) {
2828     Bcsr = (CsrMatrix *)Bmat->mat;
2829 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2830     BmatSpDescr = Bmat->matDescr;
2831 #endif
2832   } else { /* we need to use row offsets for the full matrix */
2833     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2834     Bcsr                 = new CsrMatrix;
2835     Bcsr->num_rows       = B->rmap->n;
2836     Bcsr->num_cols       = cBcsr->num_cols;
2837     Bcsr->num_entries    = cBcsr->num_entries;
2838     Bcsr->column_indices = cBcsr->column_indices;
2839     Bcsr->values         = cBcsr->values;
2840     if (!Bcusp->rowoffsets_gpu) {
2841       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2842       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2843       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2844     }
2845     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2846     mmdata->Bcsr      = Bcsr;
2847 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2848     if (Bcsr->num_rows && Bcsr->num_cols) {
2849       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2850       PetscCallCUSPARSE(stat);
2851     }
2852     BmatSpDescr = mmdata->matSpBDescr;
2853 #endif
2854   }
2855   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2856   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2857   /* precompute flops count */
2858   if (ptype == MATPRODUCT_AB) {
2859     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2860       const PetscInt st = a->i[i];
2861       const PetscInt en = a->i[i + 1];
2862       for (j = st; j < en; j++) {
2863         const PetscInt brow = a->j[j];
2864         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2865       }
2866     }
2867   } else if (ptype == MATPRODUCT_AtB) {
2868     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2869       const PetscInt anzi = a->i[i + 1] - a->i[i];
2870       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2871       flops += (2. * anzi) * bnzi;
2872     }
2873   } else { /* TODO */
2874     flops = 0.;
2875   }
2876 
2877   mmdata->flops = flops;
2878   PetscCall(PetscLogGpuTimeBegin());
2879 
2880 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2881   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2882   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2883   PetscCallCUSPARSE(stat);
2884   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2885   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2886   {
2887     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2888      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2889   */
2890     void *dBuffer1 = NULL;
2891     void *dBuffer2 = NULL;
2892     void *dBuffer3 = NULL;
2893     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2894     size_t bufferSize1 = 0;
2895     size_t bufferSize2 = 0;
2896     size_t bufferSize3 = 0;
2897     size_t bufferSize4 = 0;
2898     size_t bufferSize5 = 0;
2899 
2900     /* ask bufferSize1 bytes for external memory */
2901     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2902     PetscCallCUSPARSE(stat);
2903     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2904     /* inspect the matrices A and B to understand the memory requirement for the next step */
2905     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2906     PetscCallCUSPARSE(stat);
2907 
2908     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2909     PetscCallCUSPARSE(stat);
2910     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2911     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2912     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2913     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2914     PetscCallCUSPARSE(stat);
2915     PetscCallCUDA(cudaFree(dBuffer1));
2916     PetscCallCUDA(cudaFree(dBuffer2));
2917 
2918     /* get matrix C non-zero entries C_nnz1 */
2919     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2920     c->nz = (PetscInt)C_nnz1;
2921     /* allocate matrix C */
2922     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2923     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2924     Ccsr->values = new THRUSTARRAY(c->nz);
2925     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2926     /* update matC with the new pointers */
2927     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2928     PetscCallCUSPARSE(stat);
2929 
2930     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2931     PetscCallCUSPARSE(stat);
2932     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2933     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2934     PetscCallCUSPARSE(stat);
2935     PetscCallCUDA(cudaFree(dBuffer3));
2936     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2937     PetscCallCUSPARSE(stat);
2938     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2939   }
2940   #else
2941   size_t bufSize2;
2942   /* ask bufferSize bytes for external memory */
2943   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2944   PetscCallCUSPARSE(stat);
2945   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2946   /* inspect the matrices A and B to understand the memory requirement for the next step */
2947   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2948   PetscCallCUSPARSE(stat);
2949   /* ask bufferSize again bytes for external memory */
2950   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2951   PetscCallCUSPARSE(stat);
2952   /* The CUSPARSE documentation is not clear, nor the API
2953      We need both buffers to perform the operations properly!
2954      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2955      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2956      is stored in the descriptor! What a messy API... */
2957   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2958   /* compute the intermediate product of A * B */
2959   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2960   PetscCallCUSPARSE(stat);
2961   /* get matrix C non-zero entries C_nnz1 */
2962   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2963   c->nz = (PetscInt)C_nnz1;
2964   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2965                       mmdata->mmBufferSize / 1024));
2966   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2967   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2968   Ccsr->values = new THRUSTARRAY(c->nz);
2969   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2970   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2971   PetscCallCUSPARSE(stat);
2972   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2973   PetscCallCUSPARSE(stat);
2974   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2975 #else
2976   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2977   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2978                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2979   PetscCallCUSPARSE(stat);
2980   c->nz                = cnz;
2981   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2982   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2983   Ccsr->values = new THRUSTARRAY(c->nz);
2984   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2985 
2986   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2987   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2988      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2989      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2990   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2991                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2992   PetscCallCUSPARSE(stat);
2993 #endif
2994   PetscCall(PetscLogGpuFlops(mmdata->flops));
2995   PetscCall(PetscLogGpuTimeEnd());
2996 finalizesym:
2997   c->singlemalloc = PETSC_FALSE;
2998   c->free_a       = PETSC_TRUE;
2999   c->free_ij      = PETSC_TRUE;
3000   PetscCall(PetscMalloc1(m + 1, &c->i));
3001   PetscCall(PetscMalloc1(c->nz, &c->j));
3002   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3003     PetscInt      *d_i = c->i;
3004     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3005     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3006     ii = *Ccsr->row_offsets;
3007     jj = *Ccsr->column_indices;
3008     if (ciscompressed) d_i = c->compressedrow.i;
3009     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3010     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3011   } else {
3012     PetscInt *d_i = c->i;
3013     if (ciscompressed) d_i = c->compressedrow.i;
3014     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3015     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3016   }
3017   if (ciscompressed) { /* need to expand host row offsets */
3018     PetscInt r = 0;
3019     c->i[0]    = 0;
3020     for (k = 0; k < c->compressedrow.nrows; k++) {
3021       const PetscInt next = c->compressedrow.rindex[k];
3022       const PetscInt old  = c->compressedrow.i[k];
3023       for (; r < next; r++) c->i[r + 1] = old;
3024     }
3025     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3026   }
3027   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3028   PetscCall(PetscMalloc1(m, &c->ilen));
3029   PetscCall(PetscMalloc1(m, &c->imax));
3030   c->maxnz         = c->nz;
3031   c->nonzerorowcnt = 0;
3032   c->rmax          = 0;
3033   for (k = 0; k < m; k++) {
3034     const PetscInt nn = c->i[k + 1] - c->i[k];
3035     c->ilen[k] = c->imax[k] = nn;
3036     c->nonzerorowcnt += (PetscInt) !!nn;
3037     c->rmax = PetscMax(c->rmax, nn);
3038   }
3039   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3040   PetscCall(PetscMalloc1(c->nz, &c->a));
3041   Ccsr->num_entries = c->nz;
3042 
3043   C->nonzerostate++;
3044   PetscCall(PetscLayoutSetUp(C->rmap));
3045   PetscCall(PetscLayoutSetUp(C->cmap));
3046   Ccusp->nonzerostate = C->nonzerostate;
3047   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3048   C->preallocated     = PETSC_TRUE;
3049   C->assembled        = PETSC_FALSE;
3050   C->was_assembled    = PETSC_FALSE;
3051   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3052     mmdata->reusesym = PETSC_TRUE;
3053     C->offloadmask   = PETSC_OFFLOAD_GPU;
3054   }
3055   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3056   PetscFunctionReturn(PETSC_SUCCESS);
3057 }
3058 
3059 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3060 
3061 /* handles sparse or dense B */
/* Query the command-line option that forces the CPU backend for one product type.

   When api_user is true the product was requested through the legacy API
   (MatMatMult(), MatPtAP(), ...), so the API-specific option name api_opt is
   offered; otherwise the generic -mat_product_algorithm_backend_cpu option is
   offered under the MatProduct_* heading. api_name doubles as the man-page
   reference in both cases. On input *usecpu holds the default, on output the
   (possibly user-overridden) value. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(Mat mat, PetscBool api_user, const char api_name[], const char api_opt[], const char generic_title[], PetscBool *usecpu)
{
  PetscFunctionBegin;
  if (api_user) {
    PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, api_name, "Mat");
    PetscCall(PetscOptionsBool(api_opt, "Use CPU code", api_name, *usecpu, usecpu, NULL));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, generic_title, "Mat");
    PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", api_name, *usecpu, usecpu, NULL));
    PetscOptionsEnd();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Choose the symbolic-product implementation for products whose A is SEQAIJCUSPARSE.
   Dense B goes to the CUDA dense kernels (unless A is bound to the CPU), sparse
   CUSPARSE operands go to the SpGEMM-based routines, and anything else falls back
   to the plain SeqAIJ dispatcher. The user can force the CPU path per product type
   via the *_backend_cpu options handled by the helper above. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* B counts as a CUSPARSE operand only when neither A nor B is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat, product->api_user, "MatMatMult", "-matmatmult_backend_cpu", "MatProduct_AB", &usecpu));
      break;
    case MATPRODUCT_AtB:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat, product->api_user, "MatTransposeMatMult", "-mattransposematmult_backend_cpu", "MatProduct_AtB", &usecpu));
      break;
    case MATPRODUCT_PtAP:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat, product->api_user, "MatPtAP", "-matptap_backend_cpu", "MatProduct_PtAP", &usecpu));
      break;
    case MATPRODUCT_RARt:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat, product->api_user, "MatRARt", "-matrart_backend_cpu", "MatProduct_RARt", &usecpu));
      break;
    case MATPRODUCT_ABC:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat, product->api_user, "MatMatMatMult", "-matmatmatmult_backend_cpu", "MatProduct_ABC", &usecpu));
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) { /* A lives on the CPU: use the AIJ * dense CPU kernels */
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized GPU implementation: compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3178 
/* yy = A*xx: thin wrapper over the common kernel with no additive vector (yy slot
   passed as NULL) and trans = herm = PETSC_FALSE, i.e. op(A) is the identity op */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3185 
/* zz = A*xx + yy: thin wrapper over the common kernel with trans = herm = PETSC_FALSE */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3192 
/* yy = A^H * xx: thin wrapper over the common kernel with no additive vector and
   trans = herm = PETSC_TRUE, selecting the conjugate-transpose operation */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3199 
/* zz = A^H * xx + yy: thin wrapper over the common kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3206 
/* yy = A^T * xx: thin wrapper over the common kernel with no additive vector,
   trans = PETSC_TRUE and herm = PETSC_FALSE (plain transpose, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3213 
/* Scatter-add kernel: y[idx[i]] += x[i] for i in [0, n).
   Expects a 1-D grid. The original computed the global thread index in 32-bit int
   (blockIdx.x * blockDim.x + threadIdx.x), which can overflow/truncate when n
   exceeds the int range with 64-bit PetscInt; a PetscInt grid-stride loop fixes
   that and makes the kernel correct for any launch configuration.
   NOTE(review): entries of idx are assumed distinct (no atomics are used) —
   appears to hold for the compressed-row index lists used here; confirm for any
   new call site. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x * blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3219 
3220 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3221 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3222 {
3223   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3224   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3225   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3226   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3227   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3228   PetscBool                     compressed;
3229 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3230   PetscInt nx, ny;
3231 #endif
3232 
3233   PetscFunctionBegin;
3234   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3235   if (!a->nz) {
3236     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3237     else PetscCall(VecSeq_CUDA::Set(zz, 0));
3238     PetscFunctionReturn(PETSC_SUCCESS);
3239   }
3240   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3241   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3242   if (!trans) {
3243     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3244     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3245   } else {
3246     if (herm || !A->form_explicit_transpose) {
3247       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3248       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3249     } else {
3250       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3251       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3252     }
3253   }
3254   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3255   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3256 
3257   try {
3258     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3259     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3260     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3261 
3262     PetscCall(PetscLogGpuTimeBegin());
3263     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3264       /* z = A x + beta y.
3265          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3266          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3267       */
3268       xptr = xarray;
3269       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3270       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3271 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3272       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3273           allocated to accommodate different uses. So we get the length info directly from mat.
3274        */
3275       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3276         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3277         nx             = mat->num_cols;
3278         ny             = mat->num_rows;
3279       }
3280 #endif
3281     } else {
3282       /* z = A^T x + beta y
3283          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3284          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3285        */
3286       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3287       dptr = zarray;
3288       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3289       if (compressed) { /* Scatter x to work vector */
3290         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3291 
3292         thrust::for_each(
3293 #if PetscDefined(HAVE_THRUST_ASYNC)
3294           thrust::cuda::par.on(PetscDefaultCudaStream),
3295 #endif
3296           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3297           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3298       }
3299 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3300       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3301         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3302         nx             = mat->num_rows;
3303         ny             = mat->num_cols;
3304       }
3305 #endif
3306     }
3307 
3308     /* csr_spmv does y = alpha op(A) x + beta y */
3309     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3310 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3311       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3312       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3313         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3314         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3315         PetscCallCUSPARSE(
3316           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3317         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3318 
3319         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3320       } else {
3321         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3322         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3323         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3324       }
3325 
3326       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3327                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3328 #else
3329       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3330       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3331 #endif
3332     } else {
3333       if (cusparsestruct->nrows) {
3334 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3335         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3336 #else
3337         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3338         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3339 #endif
3340       }
3341     }
3342     PetscCall(PetscLogGpuTimeEnd());
3343 
3344     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3345       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3346         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3347           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3348         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3349           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3350         }
3351       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3352         PetscCall(VecSeq_CUDA::Set(zz, 0));
3353       }
3354 
3355       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3356       if (compressed) {
3357         PetscCall(PetscLogGpuTimeBegin());
3358         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3359            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3360            prevent that. So I just add a ScatterAdd kernel.
3361          */
3362 #if 0
3363         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3364         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3365                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3366                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3367                          VecCUDAPlusEquals());
3368 #else
3369         PetscInt n = matstruct->cprowIndices->size();
3370         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3371 #endif
3372         PetscCall(PetscLogGpuTimeEnd());
3373       }
3374     } else {
3375       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3376     }
3377     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3378     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3379     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3380   } catch (char *ex) {
3381     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3382   }
3383   if (yy) {
3384     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3385   } else {
3386     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3387   }
3388   PetscFunctionReturn(PETSC_SUCCESS);
3389 }
3390 
/* zz = A^T*xx + yy on the GPU: thin wrapper forwarding to the common mult-add kernel.
   The two trailing flags are (PETSC_TRUE, PETSC_FALSE) — presumably (transpose, hermitian),
   matching the Hermitian variants installed in MatBindToCPU; confirm against the kernel's signature. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3397 
/* Finish assembly using the host SeqAIJ path, then discard the cached device-side
   matrix if the nonzero state changed during assembly (it mirrors the old pattern). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparse     = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscObjectState    state_before = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (cusparse->deviceMat && state_before != A->nonzerostate) {
    /* the device copy is stale now that the sparsity pattern changed */
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparse->deviceMat));
    cusparse->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3412 
3413 /*@
3414    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3416    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3417    assembly performance the user should preallocate the matrix storage by setting
3418    the parameter `nz` (or the array `nnz`).
3419 
3420    Collective
3421 
3422    Input Parameters:
3423 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3424 .  m - number of rows
3425 .  n - number of columns
.  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3427 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3428 
3429    Output Parameter:
3430 .  A - the matrix
3431 
3432    Level: intermediate
3433 
3434    Notes:
3435    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3437    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3438 
3439    The AIJ format, also called
3440    compressed row storage, is fully compatible with standard Fortran
3441    storage.  That is, the stored row and column indices can begin at
3442    either one (as in Fortran) or zero.
3443 
3444    Specify the preallocated storage with either nz or nnz (not both).
3445    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3446    allocation.
3447 
3448 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3449 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* call the SeqAIJ preallocation implementation directly; the cast only drops const to match its signature */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3459 
/* Destroy a MATSEQAIJCUSPARSE matrix: free the GPU-side storage (plain mult structures for an
   unfactored matrix, triangular-factor data otherwise), clear the function pointers composed at
   conversion time, then destroy the host-side SeqAIJ part. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    /* factored matrices keep their triangular factors in spptr instead */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* composing NULL removes each previously composed method */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3481 
3482 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3483 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by building a host SeqAIJ copy, then converting the copy in place back to CUSPARSE */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3491 
/* Y <- a*X + Y for two MATSEQAIJCUSPARSE matrices, done on the GPU when possible.

   Strategy:
   - if X and Y do not share the same axpy implementation (one is bound to CPU), fall back to the host kernel;
   - if the two CSR structures are verified identical, do a single cublasXaxpy on the value arrays;
   - if X's nonzero pattern is a subset of Y's, use cusparse csrgeam (Y = a*X + 1.0*Y in place);
   - otherwise fall back to the host kernel, invalidating Y's cached transpose first. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one of the two matrices is bound to the CPU: compute on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in the geam: Y = a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* &a and &b below are host pointers, so switch the handle to host pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 geam requires an explicit workspace query/allocation before the computation */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up, so a flat BLAS axpy on them suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: the host kernel reallocates Y, so the cached transpose must go */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3577 
/* Y <- a*Y by scaling the stored nonzero values in place on the GPU with cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3597 
/* Zero all stored values of A, keeping the nonzero pattern. Device copies (including any
   cached transpose) are zeroed with thrust when present; the host array is always zeroed. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij           = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroed_on_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        zeroed_on_gpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* both copies are in sync only if the device values were actually zeroed above */
  A->offloadmask = zeroed_on_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3624 
/* Switch the matrix between CPU (SeqAIJ) and GPU (CUSPARSE) operation tables.
   flg = PETSC_TRUE: copy the data back to the host, install the host kernels, and remove
   the GPU-only composed methods and array accessors.
   flg = PETSC_FALSE: install the CUSPARSE kernels, GPU array accessors, and composed methods.
   Factored matrices only record the flag; their ops are managed elsewhere. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the GPU array accessors installed in the else-branch below */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* SeqAIJ-level accessors so generic code reads/writes the device value array */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the host kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3688 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE (in place, as a duplicate, or reusing *newmat).
   Allocates the GPU-side container (cusparse handle, storage format, algorithm defaults, or
   triangular-factor container for factored matrices), switches the default vector type to CUDA,
   installs the CUSPARSE method table, and composes the type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      /* run all cusparse work on PETSc's default stream */
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry a triangular-factor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the full GPU operation table and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3748 
/* Registered constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3756 
3757 /*MC
3758    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3759 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3761    CSR, ELL, or Hybrid format.
3762    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3763 
3764    Options Database Keys:
3765 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3766 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3767                                       Other options include ell (ellpack) or hyb (hybrid).
3768 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3769 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3770 
3771   Level: beginner
3772 
3773 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3774 M*/
3775 
3776 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3777 
/* Register the CUSPARSE-backed factorization solvers with PETSc's solver-type table */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* the banded LU solver is registered for the plain (host) SeqAIJ type */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  /* the general CUSPARSE solver handles LU, Cholesky, ILU and ICC on MATSEQAIJCUSPARSE */
  for (size_t i = 0; i < sizeof(factortypes) / sizeof(factortypes[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factortypes[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3789 
/* Release the COO-assembly support data (host-side thrust permutations plus, in extended-COO
   mode, the device jmap/perm arrays) so the matrix can be re-preallocated.

   All freed pointers are reset to NULL: MatSeqAIJCUSPARSE_Destroy frees jmap_d/perm_d again
   when they are non-NULL, so leaving them dangling here would cause a double cudaFree. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* prevent a second free in MatSeqAIJCUSPARSE_Destroy */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3807 
/* Free an entire Mat_SeqAIJCUSPARSE container: the mult structures for A and its cached
   transpose, the thrust-side scratch/permutation vectors, the cusparse handle, and the
   extended-COO device arrays. The container pointer itself is freed last. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    /* delete on NULL is a harmless no-op for these thrust vectors */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct)); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3826 
/* Free a CsrMatrix — its three device arrays and the struct itself — and NULL the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3839 
/* Free one triangular-factor structure: the cusparse matrix descriptor, the csrsv analysis
   info, the CSR storage itself, and the solve / csr2csc scratch buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor)); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3856 
/* Free a mult structure: the stored matrix (CSR or, pre-CUDA-11, HYB), its descriptors,
   the compressed-row index vector, the device-resident scalar constants, and (CUDA >= 11)
   the per-operation SpMV descriptors and buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used as SpMV coefficients */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cuSpMV entry per cusparseOperation_t (non-transpose, transpose, conj-transpose) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3898 
/* Release everything owned by the triangular-factor container (factor structures, permutations,
   work vectors, and — with the modern cusparse SpSV path — the CSR copies, descriptors and
   buffers), leaving the container itself allocated for a subsequent refactorization.
   NOTE(review): the CUDA buffers and descriptors freed in the CUSPARSE >= 11500 section are not
   reset to NULL here; a second Reset before they are reassigned would free stale handles —
   confirm callers always refill them between Resets. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* modern SpSV-based path: free the CSR copy of the factors, the solve work vectors, and all buffers */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3947 
/* Fully destroy a triangular-factors container: release all of its members, destroy its
   cuSPARSE handle, and free the container itself. *trifactors is left NULL by PetscFree(). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3958 
/* Lexicographic ordering of (row, col) index pairs; used to sort COO entries by row, then column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3967 
/* Equality of (row, col) index pairs; used to collapse duplicate COO entries */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>(); }
};
3975 
/* 1 when two values differ, 0 when equal; used with adjacent_difference to flag index changes */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
3979 
/* Logical OR yielding 0/1; used to merge the row-change and column-change flag arrays */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
3983 
3984 #include <thrust/iterator/discard_iterator.h>
3985 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Scatter/accumulate COO values into the device CSR value array of a matrix preallocated with
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(). v[] holds one scalar per original COO entry;
   imode selects between overwriting (INSERT_VALUES) and accumulating (ADD_VALUES). v may be a
   host or device array; a host array is staged through a temporary device buffer. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device staging buffer, allocated only when v[] is on host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO permutation info: nothing to scatter, just (re)assemble */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* NULL v with INSERT_VALUES zeroes the matrix values; with ADD_VALUES it changes nothing */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); /* values[i] += cooPerm_w[i] */
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* deleting NULL is a no-op */
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device, so the GPU copy is now the valid one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4062 
/* Mark any cached transpose of A as out of date; when destroy is true, also free the cached
   transpose structure and the cached csr2csc index array */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4078 
4079 #include <thrust/binary_search.h>
4080 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Basic COO preallocation: sorts the (i,j) pairs on the device, removes duplicates, and builds the
   CSR structure of the matrix. On exit, cusp->cooPerm records where each original COO entry landed
   after sorting, and cusp->cooPerm_a (created only when duplicate (i,j) pairs exist) maps each sorted
   entry to the index of the unique nonzero it contributes to. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows having at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed: discard stale permutation data from a previous preallocation */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* get coo_i[] onto the device, copying it there when the caller passed a host array */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    /* same for coo_j[]; its memory type is checked separately since it may differ from coo_i[]'s */
    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row CSR end offsets, computed on device by the upper_bound below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted row array */
    THRUSTINTARRAY w(d_j, d_j + n);                                       /* copy of the sorted column array */

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* build the host-side CSR metadata (a->i, a->j, per-row lengths) from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4215 
/* Dispatch COO preallocation: use the fast 'basic' device path when the indices are usable as-is,
   otherwise fall back to the host SeqAIJ extended-COO machinery and mirror its maps on the device */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* The basic path cannot handle negative (i.e. ignored) indices; scan for them when the arrays
     are on the host. Arrays already on the device are taken to be free of negative indices. */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* extended path: the host SeqAIJ implementation builds jmap/perm, which are then copied to device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* jmap_d[i]..jmap_d[i+1] delimit, for nonzero i, its contributing COO entries listed in perm_d
       (this is how MatAddCOOValues consumes them) */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4254 
/* Kernel: for each nonzero i of the matrix, sum its contributing COO values kv[perm[k]] for
   k in [jmap[i], jmap[i+1]) and insert (INSERT_VALUES) or add (ADD_VALUES) the result into the
   CSR value array a[]. Launched with a 1-D grid; the grid-stride loop makes it correct for any
   launch configuration. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  /* Widen to PetscCount BEFORE multiplying: blockIdx.x * blockDim.x (and gridDim.x * blockDim.x)
     are otherwise evaluated in 32-bit unsigned arithmetic and can wrap for very large grids */
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4265 
/* Set COO values on the device. Extended-COO matrices launch the MatAddCOOValues kernel with the
   device-resident jmap/perm maps; basic-COO matrices delegate to the thrust-based path. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v; /* device-resident view of v[] */
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* v[] is on host: stage it through a temporary device buffer */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* write access suffices for INSERT_VALUES (old values are discarded); ADD_VALUES must read them */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      /* one thread per nonzero, 256 threads per block; the kernel's grid-stride loop covers any excess */
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4300 
4301 /*@C
4302     MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4303 
4304    Not Collective
4305 
4306     Input Parameters:
4307 +   A - the matrix
4308 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4309 
4310     Output Parameters:
4311 +   i - the CSR row pointers
4312 -   j - the CSR column indices
4313 
4314     Level: developer
4315 
4316     Note:
4317       When compressed is true, the CSR structure does not contain empty rows
4318 
4319 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4320 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both output pointers are required; asking for only one returns nothing */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR stores compressed (no empty rows) offsets; lazily build and cache the full
         n+1 row-offset array from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4348 
4349 /*@C
4350     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4351 
4352    Not Collective
4353 
4354     Input Parameters:
4355 +   A - the matrix
4356 .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4357 .   i - the CSR row pointers
4358 -   j - the CSR column indices
4359 
4360     Level: developer
4361 
4362 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4363 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: nothing to undo for either storage form */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4374 
4375 /*@C
4376    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4377 
4378    Not Collective
4379 
4380    Input Parameter:
4381 .   A - a `MATSEQAIJCUSPARSE` matrix
4382 
4383    Output Parameter:
4384 .   a - pointer to the device data
4385 
4386    Level: developer
4387 
4388    Note:
4389    May trigger host-device copies if up-to-date matrix data is on host
4390 
4391 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4392 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* ensure the device copy is current before handing out a pointer into it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(sp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csrmat = (CsrMatrix *)sp->mat->mat;

    PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csrmat->values->data().get();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4410 
4411 /*@C
4412    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4413 
4414    Not Collective
4415 
4416    Input Parameters:
4417 +   A - a `MATSEQAIJCUSPARSE` matrix
4418 -   a - pointer to the device data
4419 
4420    Level: developer
4421 
4422 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4423 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* read-only access: no state invalidation is required */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4433 
4434 /*@C
4435    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4436 
4437    Not Collective
4438 
4439    Input Parameter:
4440 .   A - a `MATSEQAIJCUSPARSE` matrix
4441 
4442    Output Parameter:
4443 .   a - pointer to the device data
4444 
4445    Level: developer
4446 
4447    Note:
4448    May trigger host-device copies if up-to-date matrix data is on host
4449 
4450 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4451 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: bring the device copy up to date first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(sp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csrmat = (CsrMatrix *)sp->mat->mat;

    PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csrmat->values->data().get();
  }
  /* the caller may modify device values: mark the GPU copy authoritative and drop any cached transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4471 /*@C
4472    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4473 
4474    Not Collective
4475 
4476    Input Parameters:
4477 +   A - a `MATSEQAIJCUSPARSE` matrix
4478 -   a - pointer to the device data
4479 
4480    Level: developer
4481 
4482 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4483 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  /* values may have changed: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4495 
4496 /*@C
4497    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4498 
4499    Not Collective
4500 
4501    Input Parameter:
4502 .   A - a `MATSEQAIJCUSPARSE` matrix
4503 
4504    Output Parameter:
4505 .   a - pointer to the device data
4506 
4507    Level: developer
4508 
4509    Note:
4510    Does not trigger host-device copies and flags data validity on the GPU
4511 
4512 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4513 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no MatSeqAIJCUSPARSECopyToGPU() here, the caller overwrites the values */
  PetscCheck(sp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csrmat = (CsrMatrix *)sp->mat->mat;

    PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csrmat->values->data().get();
  }
  /* the GPU copy becomes authoritative and any cached transpose goes stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4532 
4533 /*@C
4534    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4535 
4536    Not Collective
4537 
4538    Input Parameters:
4539 +   A - a `MATSEQAIJCUSPARSE` matrix
4540 -   a - pointer to the device data
4541 
4542    Level: developer
4543 
4544 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4545 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  /* values were (re)written: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4557 
/* Lexicographic ordering on the leading (row, col) fields of (i, j, value, index) tuples */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
4566 
/* Unary functor that adds a fixed offset to an index */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4573 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
/*
  That is, C = [A B]: C has the same number of rows as A and B, and A->cmap->n + B->cmap->n columns.

  With MAT_INITIAL_MATRIX both the symbolic and numeric phases run on the GPU and the merge
  permutation is cached in Ccusp->cooPerm; with MAT_REUSE_MATRIX only the current values of A and B
  are scattered into C through the cached permutation. MAT_INPLACE_MATRIX and the ELL/HYB storage
  formats are not supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    /* build C's cusparse structures by hand; C always uses full (uncompressed) row storage */
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident copies of the scalars 1 and 0 used by the cusparse multiply kernels */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* work in COO format: expand row offsets to explicit row indices, then merge the two triple lists */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag each entry with its origin (1 = from A, 0 = from B) so the merge permutation can be recovered below */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* view B's column indices shifted by A->cmap->n, since B's columns sit to the right of A's in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the (row, col, val, origin-flag) tuples of A and B, ordered lexicographically by (row, col);
         the column shift keeps A's and B's column ranges disjoint within a row */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* recover the permutation: cooPerm[0..Annz) holds the positions in C of A's entries (flag 1),
         cooPerm[Annz..) the positions of B's entries (flag 0); used by the MAT_REUSE_MATRIX path */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress C's merged COO row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T] stacked vertically, so C^T's CSR arrays are just A^T's followed by B^T's
           (with B^T's row offsets shifted by a->nz, the number of entries contributed by A^T) */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* back up one slot: A^T's trailing offset is overwritten by B^T's first shifted offset */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the i/j structure on the host so the Mat_SeqAIJ side of C is usable */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths and row statistics derived from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: C's structure (and cooPerm) must have been built by a previous MAT_INITIAL_MATRIX call */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      /* scatter A's values (first Annz slots of cooPerm), then B's, into C through the cached permutation */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C^T's values directly: they are just A^T's values followed by B^T's (see the symbolic phase) */
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4855 
/* Gather n entries of A's (device) value array into v[].

   A   - the MATSEQAIJCUSPARSE matrix
   n   - number of entries to copy
   idx - indices into the value array of the entries to gather; if NULL the first n entries are copied
   v   - output array; may be host or device memory (detected at runtime via isCudaMem)
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gather in a device buffer, then copy it to the host output below */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[idx[k]] for k = 0..n-1, performed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: contiguous copy of the leading n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the result moved device -> host, so log it as a GPU-to-CPU transfer */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4891