xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 21e3ffae2f3b73c0bd738cf6d0a809700fc04bb0)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
25 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29 
30   typedef enum {
31       CUSPARSE_MV_ALG_DEFAULT = 0,
32       CUSPARSE_COOMV_ALG      = 1,
33       CUSPARSE_CSRMV_ALG1     = 2,
34       CUSPARSE_CSRMV_ALG2     = 3
35   } cusparseSpMVAlg_t;
36 
37   typedef enum {
38       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44       CUSPARSE_SPMM_COO_ALG1    = 1,
45       CUSPARSE_SPMM_COO_ALG2    = 2,
46       CUSPARSE_SPMM_COO_ALG3    = 3,
47       CUSPARSE_SPMM_COO_ALG4    = 5,
48       CUSPARSE_SPMM_CSR_ALG1    = 4,
49       CUSPARSE_SPMM_CSR_ALG2    = 6,
50   } cusparseSpMMAlg_t;
51 
52   typedef enum {
53       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55   } cusparseCsr2CscAlg_t;
56   */
57 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60 #endif
61 
62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65 
66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
68 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
69 
70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
71 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
73 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
84 
85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
87 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
88 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
89 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
90 
91 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
92 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
93 
94 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
95 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
96 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
97 
98 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
99 {
100   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
101 
102   PetscFunctionBegin;
103   switch (op) {
104   case MAT_CUSPARSE_MULT:
105     cusparsestruct->format = format;
106     break;
107   case MAT_CUSPARSE_ALL:
108     cusparsestruct->format = format;
109     break;
110   default:
111     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112   }
113   PetscFunctionReturn(PETSC_SUCCESS);
114 }
115 
116 /*@
117    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
118    operation. Only the `MatMult()` operation can use different GPU storage formats
119 
120    Not Collective
121 
122    Input Parameters:
123 +  A - Matrix of type `MATSEQAIJCUSPARSE`
124 .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
125         `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
126 -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
127 
128    Output Parameter:
129 
130    Level: intermediate
131 
132 .seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
133 @*/
134 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
135 {
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
138   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
139   PetscFunctionReturn(PETSC_SUCCESS);
140 }
141 
142 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
143 {
144   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
145 
146   PetscFunctionBegin;
147   cusparsestruct->use_cpu_solve = use_cpu;
148   PetscFunctionReturn(PETSC_SUCCESS);
149 }
150 
151 /*@
152    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
153 
154    Input Parameters:
155 +  A - Matrix of type `MATSEQAIJCUSPARSE`
156 -  use_cpu - set flag for using the built-in CPU `MatSolve()`
157 
158    Output Parameter:
159 
160    Note:
161    The cuSparse LU solver currently computes the factors with the built-in CPU method
162    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
163    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
164 
165    Level: intermediate
166 
167 .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
168 @*/
169 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
170 {
171   PetscFunctionBegin;
172   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
173   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
174   PetscFunctionReturn(PETSC_SUCCESS);
175 }
176 
177 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
178 {
179   PetscFunctionBegin;
180   switch (op) {
181   case MAT_FORM_EXPLICIT_TRANSPOSE:
182     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
183     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
184     A->form_explicit_transpose = flg;
185     break;
186   default:
187     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
188     break;
189   }
190   PetscFunctionReturn(PETSC_SUCCESS);
191 }
192 
193 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
194 
195 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
196 {
197   Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
198   IS                  isrow = b->row, iscol = b->col;
199   PetscBool           row_identity, col_identity;
200   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;
201 
202   PetscFunctionBegin;
203   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
204   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
205   B->offloadmask = PETSC_OFFLOAD_CPU;
206   /* determine which version of MatSolve needs to be used. */
207   PetscCall(ISIdentity(isrow, &row_identity));
208   PetscCall(ISIdentity(iscol, &col_identity));
209 
210   if (!cusparsestruct->use_cpu_solve) {
211     if (row_identity && col_identity) {
212       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
213       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
214     } else {
215       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
216       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
217     }
218   }
219   B->ops->matsolve          = NULL;
220   B->ops->matsolvetranspose = NULL;
221 
222   /* get the triangular factors */
223   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
228 {
229   MatCUSPARSEStorageFormat format;
230   PetscBool                flg;
231   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
232 
233   PetscFunctionBegin;
234   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
235   if (A->factortype == MAT_FACTOR_NONE) {
236     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
237     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
238 
239     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
240     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
241     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
242     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
243 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
244     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
245     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
246   #if CUSPARSE_VERSION > 11301
247     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
248   #else
249     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
250   #endif
251     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
252     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
253 
254     PetscCall(
255       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
256     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
257 #endif
258   }
259   PetscOptionsHeadEnd();
260   PetscFunctionReturn(PETSC_SUCCESS);
261 }
262 
263 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
264 {
265   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
266   PetscInt                           n                  = A->rmap->n;
267   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
268   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
269   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
270   const MatScalar                   *aa = a->a, *v;
271   PetscInt                          *AiLo, *AjLo;
272   PetscInt                           i, nz, nzLower, offset, rowOffset;
273 
274   PetscFunctionBegin;
275   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
276   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
277     try {
278       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
279       nzLower = n + ai[n] - ai[1];
280       if (!loTriFactor) {
281         PetscScalar *AALo;
282 
283         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
284 
285         /* Allocate Space for the lower triangular matrix */
286         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
287         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
288 
289         /* Fill the lower triangular matrix */
290         AiLo[0]   = (PetscInt)0;
291         AiLo[n]   = nzLower;
292         AjLo[0]   = (PetscInt)0;
293         AALo[0]   = (MatScalar)1.0;
294         v         = aa;
295         vi        = aj;
296         offset    = 1;
297         rowOffset = 1;
298         for (i = 1; i < n; i++) {
299           nz = ai[i + 1] - ai[i];
300           /* additional 1 for the term on the diagonal */
301           AiLo[i] = rowOffset;
302           rowOffset += nz + 1;
303 
304           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
305           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
306 
307           offset += nz;
308           AjLo[offset] = (PetscInt)i;
309           AALo[offset] = (MatScalar)1.0;
310           offset += 1;
311 
312           v += nz;
313           vi += nz;
314         }
315 
316         /* allocate space for the triangular factor information */
317         PetscCall(PetscNew(&loTriFactor));
318         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
319         /* Create the matrix description */
320         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
321         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
322 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
323         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
324 #else
325         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
326 #endif
327         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
328         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
329 
330         /* set the operation */
331         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
332 
333         /* set the matrix */
334         loTriFactor->csrMat              = new CsrMatrix;
335         loTriFactor->csrMat->num_rows    = n;
336         loTriFactor->csrMat->num_cols    = n;
337         loTriFactor->csrMat->num_entries = nzLower;
338 
339         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
340         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
341 
342         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
343         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
344 
345         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
346         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
347 
348         /* Create the solve analysis information */
349         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
350         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
351 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
352         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
353                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
354         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
355 #endif
356 
357         /* perform the solve analysis */
358         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
359                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
360         PetscCallCUDA(WaitForCUDA());
361         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
362 
363         /* assign the pointer */
364         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
365         loTriFactor->AA_h                                          = AALo;
366         PetscCallCUDA(cudaFreeHost(AiLo));
367         PetscCallCUDA(cudaFreeHost(AjLo));
368         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
369       } else { /* update values only */
370         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
371         /* Fill the lower triangular matrix */
372         loTriFactor->AA_h[0] = 1.0;
373         v                    = aa;
374         vi                   = aj;
375         offset               = 1;
376         for (i = 1; i < n; i++) {
377           nz = ai[i + 1] - ai[i];
378           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
379           offset += nz;
380           loTriFactor->AA_h[offset] = 1.0;
381           offset += 1;
382           v += nz;
383         }
384         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
385         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
386       }
387     } catch (char *ex) {
388       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
389     }
390   }
391   PetscFunctionReturn(PETSC_SUCCESS);
392 }
393 
394 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
395 {
396   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
397   PetscInt                           n                  = A->rmap->n;
398   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
399   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
400   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
401   const MatScalar                   *aa = a->a, *v;
402   PetscInt                          *AiUp, *AjUp;
403   PetscInt                           i, nz, nzUpper, offset;
404 
405   PetscFunctionBegin;
406   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
407   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
408     try {
409       /* next, figure out the number of nonzeros in the upper triangular matrix. */
410       nzUpper = adiag[0] - adiag[n];
411       if (!upTriFactor) {
412         PetscScalar *AAUp;
413 
414         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
415 
416         /* Allocate Space for the upper triangular matrix */
417         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
418         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
419 
420         /* Fill the upper triangular matrix */
421         AiUp[0] = (PetscInt)0;
422         AiUp[n] = nzUpper;
423         offset  = nzUpper;
424         for (i = n - 1; i >= 0; i--) {
425           v  = aa + adiag[i + 1] + 1;
426           vi = aj + adiag[i + 1] + 1;
427 
428           /* number of elements NOT on the diagonal */
429           nz = adiag[i] - adiag[i + 1] - 1;
430 
431           /* decrement the offset */
432           offset -= (nz + 1);
433 
434           /* first, set the diagonal elements */
435           AjUp[offset] = (PetscInt)i;
436           AAUp[offset] = (MatScalar)1. / v[nz];
437           AiUp[i]      = AiUp[i + 1] - (nz + 1);
438 
439           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
440           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
441         }
442 
443         /* allocate space for the triangular factor information */
444         PetscCall(PetscNew(&upTriFactor));
445         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
446 
447         /* Create the matrix description */
448         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
449         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
450 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
451         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
452 #else
453         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
454 #endif
455         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
456         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
457 
458         /* set the operation */
459         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
460 
461         /* set the matrix */
462         upTriFactor->csrMat              = new CsrMatrix;
463         upTriFactor->csrMat->num_rows    = n;
464         upTriFactor->csrMat->num_cols    = n;
465         upTriFactor->csrMat->num_entries = nzUpper;
466 
467         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
468         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
469 
470         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
471         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
472 
473         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
474         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
475 
476         /* Create the solve analysis information */
477         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
478         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
479 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
480         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
481                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
482         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
483 #endif
484 
485         /* perform the solve analysis */
486         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
487                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
488 
489         PetscCallCUDA(WaitForCUDA());
490         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
491 
492         /* assign the pointer */
493         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
494         upTriFactor->AA_h                                          = AAUp;
495         PetscCallCUDA(cudaFreeHost(AiUp));
496         PetscCallCUDA(cudaFreeHost(AjUp));
497         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
498       } else {
499         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
500         /* Fill the upper triangular matrix */
501         offset = nzUpper;
502         for (i = n - 1; i >= 0; i--) {
503           v = aa + adiag[i + 1] + 1;
504 
505           /* number of elements NOT on the diagonal */
506           nz = adiag[i] - adiag[i + 1] - 1;
507 
508           /* decrement the offset */
509           offset -= (nz + 1);
510 
511           /* first, set the diagonal elements */
512           upTriFactor->AA_h[offset] = 1. / v[nz];
513           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
514         }
515         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
516         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
517       }
518     } catch (char *ex) {
519       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
520     }
521   }
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
526 {
527   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
528   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
529   IS                            isrow = a->row, iscol = a->icol;
530   PetscBool                     row_identity, col_identity;
531   PetscInt                      n = A->rmap->n;
532 
533   PetscFunctionBegin;
534   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
535   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
536   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
537 
538   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
539   cusparseTriFactors->nnz = a->nz;
540 
541   A->offloadmask = PETSC_OFFLOAD_BOTH;
542   /* lower triangular indices */
543   PetscCall(ISIdentity(isrow, &row_identity));
544   if (!row_identity && !cusparseTriFactors->rpermIndices) {
545     const PetscInt *r;
546 
547     PetscCall(ISGetIndices(isrow, &r));
548     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
549     cusparseTriFactors->rpermIndices->assign(r, r + n);
550     PetscCall(ISRestoreIndices(isrow, &r));
551     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
552   }
553 
554   /* upper triangular indices */
555   PetscCall(ISIdentity(iscol, &col_identity));
556   if (!col_identity && !cusparseTriFactors->cpermIndices) {
557     const PetscInt *c;
558 
559     PetscCall(ISGetIndices(iscol, &c));
560     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
561     cusparseTriFactors->cpermIndices->assign(c, c + n);
562     PetscCall(ISRestoreIndices(iscol, &c));
563     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
564   }
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
568 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
569 {
570   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
571   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
572   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
573   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
574   PetscInt                          *AiUp, *AjUp;
575   PetscScalar                       *AAUp;
576   PetscScalar                       *AALo;
577   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
578   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
579   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
580   const MatScalar                   *aa = b->a, *v;
581 
582   PetscFunctionBegin;
583   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
584   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
585     try {
586       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
587       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
588       if (!upTriFactor && !loTriFactor) {
589         /* Allocate Space for the upper triangular matrix */
590         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
591         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
592 
593         /* Fill the upper triangular matrix */
594         AiUp[0] = (PetscInt)0;
595         AiUp[n] = nzUpper;
596         offset  = 0;
597         for (i = 0; i < n; i++) {
598           /* set the pointers */
599           v  = aa + ai[i];
600           vj = aj + ai[i];
601           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
602 
603           /* first, set the diagonal elements */
604           AjUp[offset] = (PetscInt)i;
605           AAUp[offset] = (MatScalar)1.0 / v[nz];
606           AiUp[i]      = offset;
607           AALo[offset] = (MatScalar)1.0 / v[nz];
608 
609           offset += 1;
610           if (nz > 0) {
611             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
612             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
613             for (j = offset; j < offset + nz; j++) {
614               AAUp[j] = -AAUp[j];
615               AALo[j] = AAUp[j] / v[nz];
616             }
617             offset += nz;
618           }
619         }
620 
621         /* allocate space for the triangular factor information */
622         PetscCall(PetscNew(&upTriFactor));
623         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
624 
625         /* Create the matrix description */
626         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
627         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
628 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
629         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
630 #else
631         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
632 #endif
633         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
634         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
635 
636         /* set the matrix */
637         upTriFactor->csrMat              = new CsrMatrix;
638         upTriFactor->csrMat->num_rows    = A->rmap->n;
639         upTriFactor->csrMat->num_cols    = A->cmap->n;
640         upTriFactor->csrMat->num_entries = a->nz;
641 
642         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
643         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
644 
645         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
646         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
647 
648         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
649         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
650 
651         /* set the operation */
652         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
653 
654         /* Create the solve analysis information */
655         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
656         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
657 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
658         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
659                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
660         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
661 #endif
662 
663         /* perform the solve analysis */
664         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
665                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
666 
667         PetscCallCUDA(WaitForCUDA());
668         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
669 
670         /* assign the pointer */
671         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
672 
673         /* allocate space for the triangular factor information */
674         PetscCall(PetscNew(&loTriFactor));
675         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
676 
677         /* Create the matrix description */
678         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
679         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
680 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
681         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
682 #else
683         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
684 #endif
685         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
686         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
687 
688         /* set the operation */
689         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
690 
691         /* set the matrix */
692         loTriFactor->csrMat              = new CsrMatrix;
693         loTriFactor->csrMat->num_rows    = A->rmap->n;
694         loTriFactor->csrMat->num_cols    = A->cmap->n;
695         loTriFactor->csrMat->num_entries = a->nz;
696 
697         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
698         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
699 
700         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
701         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
702 
703         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
704         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
705 
706         /* Create the solve analysis information */
707         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
708         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
709 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
710         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
711                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
712         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
713 #endif
714 
715         /* perform the solve analysis */
716         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
717                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
718 
719         PetscCallCUDA(WaitForCUDA());
720         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
721 
722         /* assign the pointer */
723         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
724 
725         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
726         PetscCallCUDA(cudaFreeHost(AiUp));
727         PetscCallCUDA(cudaFreeHost(AjUp));
728       } else {
729         /* Fill the upper triangular matrix */
730         offset = 0;
731         for (i = 0; i < n; i++) {
732           /* set the pointers */
733           v  = aa + ai[i];
734           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
735 
736           /* first, set the diagonal elements */
737           AAUp[offset] = 1.0 / v[nz];
738           AALo[offset] = 1.0 / v[nz];
739 
740           offset += 1;
741           if (nz > 0) {
742             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
743             for (j = offset; j < offset + nz; j++) {
744               AAUp[j] = -AAUp[j];
745               AALo[j] = AAUp[j] / v[nz];
746             }
747             offset += nz;
748           }
749         }
750         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
751         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
752         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
753         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
754         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
755       }
756       PetscCallCUDA(cudaFreeHost(AAUp));
757       PetscCallCUDA(cudaFreeHost(AALo));
758     } catch (char *ex) {
759       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
760     }
761   }
762   PetscFunctionReturn(PETSC_SUCCESS);
763 }
764 
765 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
766 {
767   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
768   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
769   IS                            ip                 = a->row;
770   PetscBool                     perm_identity;
771   PetscInt                      n = A->rmap->n;
772 
773   PetscFunctionBegin;
774   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
775   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
776   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
777   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
778 
779   A->offloadmask = PETSC_OFFLOAD_BOTH;
780 
781   /* lower triangular indices */
782   PetscCall(ISIdentity(ip, &perm_identity));
783   if (!perm_identity) {
784     IS              iip;
785     const PetscInt *irip, *rip;
786 
787     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
788     PetscCall(ISGetIndices(iip, &irip));
789     PetscCall(ISGetIndices(ip, &rip));
790     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
791     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
792     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
793     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
794     PetscCall(ISRestoreIndices(iip, &irip));
795     PetscCall(ISDestroy(&iip));
796     PetscCall(ISRestoreIndices(ip, &rip));
797     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
798   }
799   PetscFunctionReturn(PETSC_SUCCESS);
800 }
801 
802 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
803 {
804   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
805   IS          ip = b->row;
806   PetscBool   perm_identity;
807 
808   PetscFunctionBegin;
809   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
810   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
811   B->offloadmask = PETSC_OFFLOAD_CPU;
812   /* determine which version of MatSolve needs to be used. */
813   PetscCall(ISIdentity(ip, &perm_identity));
814   if (perm_identity) {
815     B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
816     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
817     B->ops->matsolve          = NULL;
818     B->ops->matsolvetranspose = NULL;
819   } else {
820     B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
821     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
822     B->ops->matsolve          = NULL;
823     B->ops->matsolvetranspose = NULL;
824   }
825 
826   /* get the triangular factors */
827   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
828   PetscFunctionReturn(PETSC_SUCCESS);
829 }
830 
831 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
832 {
833   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
834   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
835   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
836   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
837   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
838   cusparseIndexBase_t                indexBase;
839   cusparseMatrixType_t               matrixType;
840   cusparseFillMode_t                 fillMode;
841   cusparseDiagType_t                 diagType;
842 
843   PetscFunctionBegin;
844   /* allocate space for the transpose of the lower triangular factor */
845   PetscCall(PetscNew(&loTriFactorT));
846   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
847 
848   /* set the matrix descriptors of the lower triangular factor */
849   matrixType = cusparseGetMatType(loTriFactor->descr);
850   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
851   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
852   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
853 
854   /* Create the matrix description */
855   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
856   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
857   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
858   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
859   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
860 
861   /* set the operation */
862   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
863 
864   /* allocate GPU space for the CSC of the lower triangular factor*/
865   loTriFactorT->csrMat                 = new CsrMatrix;
866   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
867   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
868   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
869   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
870   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
871   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
872 
873   /* compute the transpose of the lower triangular factor, i.e. the CSC */
874 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
875   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
876                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
877                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
878   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
879 #endif
880 
881   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
882   {
883     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
884     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
885                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
886 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
887                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
888 #else
889                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
890 #endif
891     PetscCallCUSPARSE(stat);
892   }
893 
894   PetscCallCUDA(WaitForCUDA());
895   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
896 
897   /* Create the solve analysis information */
898   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
899   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
900 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
901   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
902                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
903   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
904 #endif
905 
906   /* perform the solve analysis */
907   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
908                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
909 
910   PetscCallCUDA(WaitForCUDA());
911   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
912 
913   /* assign the pointer */
914   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
915 
916   /*********************************************/
917   /* Now the Transpose of the Upper Tri Factor */
918   /*********************************************/
919 
920   /* allocate space for the transpose of the upper triangular factor */
921   PetscCall(PetscNew(&upTriFactorT));
922   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
923 
924   /* set the matrix descriptors of the upper triangular factor */
925   matrixType = cusparseGetMatType(upTriFactor->descr);
926   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
927   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
928   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
929 
930   /* Create the matrix description */
931   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
932   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
933   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
934   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
935   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
936 
937   /* set the operation */
938   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
939 
940   /* allocate GPU space for the CSC of the upper triangular factor*/
941   upTriFactorT->csrMat                 = new CsrMatrix;
942   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
943   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
944   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
945   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
946   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
947   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
948 
949   /* compute the transpose of the upper triangular factor, i.e. the CSC */
950 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
951   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
952                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
953                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
954   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
955 #endif
956 
957   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
958   {
959     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
960     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
961                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
962 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
963                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
964 #else
965                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
966 #endif
967     PetscCallCUSPARSE(stat);
968   }
969 
970   PetscCallCUDA(WaitForCUDA());
971   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
972 
973   /* Create the solve analysis information */
974   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
975   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
976 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
977   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
978                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
979   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
980 #endif
981 
982   /* perform the solve analysis */
983   /* christ, would it have killed you to put this stuff in a function????????? */
984   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
985                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
986 
987   PetscCallCUDA(WaitForCUDA());
988   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
989 
990   /* assign the pointer */
991   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
992   PetscFunctionReturn(PETSC_SUCCESS);
993 }
994 
995 struct PetscScalarToPetscInt {
996   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
997 };
998 
999 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1000 {
1001   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1002   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1003   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1004   cusparseStatus_t              stat;
1005   cusparseIndexBase_t           indexBase;
1006 
1007   PetscFunctionBegin;
1008   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1009   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1010   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1011   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1012   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1013   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1014   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1015   PetscCall(PetscLogGpuTimeBegin());
1016   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1017   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1018     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1019     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1020     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1021     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1022     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1023 
1024     /* set alpha and beta */
1025     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1026     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1027     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1028     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1029     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1030     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1031 
1032     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1033       CsrMatrix *matrixT      = new CsrMatrix;
1034       matstructT->mat         = matrixT;
1035       matrixT->num_rows       = A->cmap->n;
1036       matrixT->num_cols       = A->rmap->n;
1037       matrixT->num_entries    = a->nz;
1038       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1039       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1040       matrixT->values         = new THRUSTARRAY(a->nz);
1041 
1042       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1043       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1044 
1045 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1046   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1047       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1048                                indexBase, cusparse_scalartype);
1049       PetscCallCUSPARSE(stat);
1050   #else
1051       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1052            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1053 
1054            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1055            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1056            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1057         */
1058       if (matrixT->num_entries) {
1059         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1060         PetscCallCUSPARSE(stat);
1061 
1062       } else {
1063         matstructT->matDescr = NULL;
1064         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1065       }
1066   #endif
1067 #endif
1068     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1069 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1070       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1071 #else
1072       CsrMatrix *temp = new CsrMatrix;
1073       CsrMatrix *tempT = new CsrMatrix;
1074       /* First convert HYB to CSR */
1075       temp->num_rows = A->rmap->n;
1076       temp->num_cols = A->cmap->n;
1077       temp->num_entries = a->nz;
1078       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1079       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1080       temp->values = new THRUSTARRAY(a->nz);
1081 
1082       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1083       PetscCallCUSPARSE(stat);
1084 
1085       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1086       tempT->num_rows = A->rmap->n;
1087       tempT->num_cols = A->cmap->n;
1088       tempT->num_entries = a->nz;
1089       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1090       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1091       tempT->values = new THRUSTARRAY(a->nz);
1092 
1093       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1094                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1095       PetscCallCUSPARSE(stat);
1096 
1097       /* Last, convert CSC to HYB */
1098       cusparseHybMat_t hybMat;
1099       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1100       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1101       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1102       PetscCallCUSPARSE(stat);
1103 
1104       /* assign the pointer */
1105       matstructT->mat = hybMat;
1106       A->transupdated = PETSC_TRUE;
1107       /* delete temporaries */
1108       if (tempT) {
1109         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1110         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1111         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1112         delete (CsrMatrix *)tempT;
1113       }
1114       if (temp) {
1115         if (temp->values) delete (THRUSTARRAY *)temp->values;
1116         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1117         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1118         delete (CsrMatrix *)temp;
1119       }
1120 #endif
1121     }
1122   }
1123   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1124     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1125     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1126     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1127     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1128     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1129     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1130     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1131     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1132     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1133     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1134     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1135       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1136       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1137       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1138     }
1139     if (!cusparsestruct->csr2csc_i) {
1140       THRUSTARRAY csr2csc_a(matrix->num_entries);
1141       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1142 
1143       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1144 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1145       void  *csr2cscBuffer;
1146       size_t csr2cscBufferSize;
1147       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1148                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1149       PetscCallCUSPARSE(stat);
1150       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1151 #endif
1152 
1153       if (matrix->num_entries) {
1154         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1155            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1156            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1157 
1158            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1159            should be filled with indexBase. So I just take a shortcut here.
1160         */
1161         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1162 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1163                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1164         PetscCallCUSPARSE(stat);
1165 #else
1166                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1167         PetscCallCUSPARSE(stat);
1168 #endif
1169       } else {
1170         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1171       }
1172 
1173       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1174       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1175 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1176       PetscCallCUDA(cudaFree(csr2cscBuffer));
1177 #endif
1178     }
1179     PetscCallThrust(
1180       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1181   }
1182   PetscCall(PetscLogGpuTimeEnd());
1183   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1184   /* the compressed row indices is not used for matTranspose */
1185   matstructT->cprowIndices = NULL;
1186   /* assign the pointer */
1187   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1188   A->transupdated                                = PETSC_TRUE;
1189   PetscFunctionReturn(PETSC_SUCCESS);
1190 }
1191 
1192 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1193 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1194 {
1195   PetscInt                              n = xx->map->n;
1196   const PetscScalar                    *barray;
1197   PetscScalar                          *xarray;
1198   thrust::device_ptr<const PetscScalar> bGPU;
1199   thrust::device_ptr<PetscScalar>       xGPU;
1200   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1201   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1202   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1203   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1204 
1205   PetscFunctionBegin;
1206   /* Analyze the matrix and create the transpose ... on the fly */
1207   if (!loTriFactorT && !upTriFactorT) {
1208     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1209     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1210     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1211   }
1212 
1213   /* Get the GPU pointers */
1214   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1215   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1216   xGPU = thrust::device_pointer_cast(xarray);
1217   bGPU = thrust::device_pointer_cast(barray);
1218 
1219   PetscCall(PetscLogGpuTimeBegin());
1220   /* First, reorder with the row permutation */
1221   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1222 
1223   /* First, solve U */
1224   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1225                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1226 
1227   /* Then, solve L */
1228   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1229                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1230 
1231   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1232   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1233 
1234   /* Copy the temporary to the full solution. */
1235   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1236 
1237   /* restore */
1238   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1239   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1240   PetscCall(PetscLogGpuTimeEnd());
1241   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1242   PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244 
1245 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1246 {
1247   const PetscScalar                 *barray;
1248   PetscScalar                       *xarray;
1249   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1250   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1251   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1252   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1253 
1254   PetscFunctionBegin;
1255   /* Analyze the matrix and create the transpose ... on the fly */
1256   if (!loTriFactorT && !upTriFactorT) {
1257     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1258     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1259     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1260   }
1261 
1262   /* Get the GPU pointers */
1263   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1264   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1265 
1266   PetscCall(PetscLogGpuTimeBegin());
1267   /* First, solve U */
1268   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1269                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1270 
1271   /* Then, solve L */
1272   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1273                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1274 
1275   /* restore */
1276   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1277   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1278   PetscCall(PetscLogGpuTimeEnd());
1279   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1280   PetscFunctionReturn(PETSC_SUCCESS);
1281 }
1282 
1283 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1284 {
1285   const PetscScalar                    *barray;
1286   PetscScalar                          *xarray;
1287   thrust::device_ptr<const PetscScalar> bGPU;
1288   thrust::device_ptr<PetscScalar>       xGPU;
1289   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1290   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1291   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1292   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1293 
1294   PetscFunctionBegin;
1295   /* Get the GPU pointers */
1296   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1297   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1298   xGPU = thrust::device_pointer_cast(xarray);
1299   bGPU = thrust::device_pointer_cast(barray);
1300 
1301   PetscCall(PetscLogGpuTimeBegin());
1302   /* First, reorder with the row permutation */
1303   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1304 
1305   /* Next, solve L */
1306   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1307                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1308 
1309   /* Then, solve U */
1310   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1311                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1312 
1313   /* Last, reorder with the column permutation */
1314   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1315 
1316   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1317   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1318   PetscCall(PetscLogGpuTimeEnd());
1319   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1320   PetscFunctionReturn(PETSC_SUCCESS);
1321 }
1322 
1323 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1324 {
1325   const PetscScalar                 *barray;
1326   PetscScalar                       *xarray;
1327   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1328   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1329   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1330   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1331 
1332   PetscFunctionBegin;
1333   /* Get the GPU pointers */
1334   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1335   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1336 
1337   PetscCall(PetscLogGpuTimeBegin());
1338   /* First, solve L */
1339   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1340                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1341 
1342   /* Next, solve U */
1343   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1344                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1345 
1346   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1347   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1348   PetscCall(PetscLogGpuTimeEnd());
1349   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1350   PetscFunctionReturn(PETSC_SUCCESS);
1351 }
1352 
1353 #if CUSPARSE_VERSION >= 11500
1354 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
1355 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1356 {
1357   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1358   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1359   const PetscScalar            *barray;
1360   PetscScalar                  *xarray;
1361 
1362   PetscFunctionBegin;
1363   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1364   PetscCall(VecCUDAGetArrayRead(b, &barray));
1365   PetscCall(PetscLogGpuTimeBegin());
1366 
1367   /* Solve L*y = b */
1368   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1369   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1370   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1371                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
1372                                        fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()!
1373 
1374   /* Solve U*x = y */
1375   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1376   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1377                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1378 
1379   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1380   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1381 
1382   PetscCall(PetscLogGpuTimeEnd());
1383   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1384   PetscFunctionReturn(PETSC_SUCCESS);
1385 }
1386 
1387 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1388 {
1389   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1390   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1391   const PetscScalar            *barray;
1392   PetscScalar                  *xarray;
1393 
1394   PetscFunctionBegin;
1395   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1396     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1397     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1398                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1399 
1400     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1401     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1402     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1403     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1404     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1405   }
1406 
1407   if (!fs->updatedTransposeSpSVAnalysis) {
1408     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1409 
1410     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1411     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1412   }
1413 
1414   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1415   PetscCall(VecCUDAGetArrayRead(b, &barray));
1416   PetscCall(PetscLogGpuTimeBegin());
1417 
1418   /* Solve Ut*y = b */
1419   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1420   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1421   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1422                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1423 
1424   /* Solve Lt*x = y */
1425   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1426   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1427                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1428 
1429   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1430   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1431   PetscCall(PetscLogGpuTimeEnd());
1432   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1433   PetscFunctionReturn(PETSC_SUCCESS);
1434 }
1435 
1436 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1437 {
1438   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1439   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1440   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1441   CsrMatrix                    *Acsr;
1442   PetscInt                      m, nz;
1443   PetscBool                     flg;
1444 
1445   PetscFunctionBegin;
1446   if (PetscDefined(USE_DEBUG)) {
1447     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1448     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1449   }
1450 
1451   /* Copy A's value to fact */
1452   m  = fact->rmap->n;
1453   nz = aij->nz;
1454   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1455   Acsr = (CsrMatrix *)Acusp->mat->mat;
1456   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1457 
1458   /* Factorize fact inplace */
1459   if (m)
1460     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1461                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1462   if (PetscDefined(USE_DEBUG)) {
1463     int              numerical_zero;
1464     cusparseStatus_t status;
1465     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1466     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1467   }
1468 
1469   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1470      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1471   */
1472   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1473 
1474   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1475 
1476   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1477   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1478 
1479   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1480   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1481   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1482   fact->ops->matsolve          = NULL;
1483   fact->ops->matsolvetranspose = NULL;
1484   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1485   PetscFunctionReturn(PETSC_SUCCESS);
1486 }
1487 
1488 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1489 {
1490   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1491   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1492   PetscInt                      m, nz;
1493 
1494   PetscFunctionBegin;
1495   if (PetscDefined(USE_DEBUG)) {
1496     PetscInt  i;
1497     PetscBool flg, missing;
1498 
1499     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1500     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1501     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1502     PetscCall(MatMissingDiagonal(A, &missing, &i));
1503     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1504   }
1505 
1506   /* Free the old stale stuff */
1507   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1508 
1509   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1510      but they will not be used. Allocate them just for easy debugging.
1511    */
1512   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1513 
1514   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1515   fact->factortype             = MAT_FACTOR_ILU;
1516   fact->info.factor_mallocs    = 0;
1517   fact->info.fill_ratio_given  = info->fill;
1518   fact->info.fill_ratio_needed = 1.0;
1519 
1520   aij->row = NULL;
1521   aij->col = NULL;
1522 
1523   /* ====================================================================== */
1524   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1525   /* We'll do in-place factorization on fact                                */
1526   /* ====================================================================== */
1527   const int *Ai, *Aj;
1528 
1529   m  = fact->rmap->n;
1530   nz = aij->nz;
1531 
1532   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1533   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1534   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1535   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1536   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1537   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1538 
1539   /* ====================================================================== */
1540   /* Create descriptors for M, L, U                                         */
1541   /* ====================================================================== */
1542   cusparseFillMode_t fillMode;
1543   cusparseDiagType_t diagType;
1544 
1545   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1546   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1547   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1548 
1549   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1550     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1551     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1552     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1553     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1554   */
1555   fillMode = CUSPARSE_FILL_MODE_LOWER;
1556   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1557   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1558   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1559   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1560 
1561   fillMode = CUSPARSE_FILL_MODE_UPPER;
1562   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1563   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1564   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1565   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1566 
1567   /* ========================================================================= */
1568   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1569   /* ========================================================================= */
1570   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1571   if (m)
1572     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1573                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1574 
1575   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1576   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1577 
1578   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1579   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1580 
1581   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1582   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1583 
1584   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1585   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1586 
1587   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1588      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1589      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1590      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1591    */
1592   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1593     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1594     fs->spsvBuffer_L = fs->factBuffer_M;
1595     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1596   } else {
1597     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1598     fs->spsvBuffer_U = fs->factBuffer_M;
1599     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1600   }
1601 
1602   /* ========================================================================== */
1603   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1604   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1605   /* ========================================================================== */
1606   int              structural_zero;
1607   cusparseStatus_t status;
1608 
1609   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1610   if (m)
1611     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1612                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1613   if (PetscDefined(USE_DEBUG)) {
1614     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1615     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1616     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1617   }
1618 
1619   /* Estimate FLOPs of the numeric factorization */
1620   {
1621     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1622     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1623     PetscLogDouble flops = 0.0;
1624 
1625     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1626     Ai    = Aseq->i;
1627     Adiag = Aseq->diag;
1628     for (PetscInt i = 0; i < m; i++) {
1629       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1630         nzRow  = Ai[i + 1] - Ai[i];
1631         nzLeft = Adiag[i] - Ai[i];
1632         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1633           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1634         */
1635         nzLeft = (nzRow - 1) / 2;
1636         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1637       }
1638     }
1639     fs->numericFactFlops = flops;
1640   }
1641   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1642   PetscFunctionReturn(PETSC_SUCCESS);
1643 }
1644 
1645 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1646 {
1647   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1648   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1649   const PetscScalar            *barray;
1650   PetscScalar                  *xarray;
1651 
1652   PetscFunctionBegin;
1653   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1654   PetscCall(VecCUDAGetArrayRead(b, &barray));
1655   PetscCall(PetscLogGpuTimeBegin());
1656 
1657   /* Solve L*y = b */
1658   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1659   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1660   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1661                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1662 
1663   /* Solve Lt*x = y */
1664   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1665   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1666                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1667 
1668   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1669   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1670 
1671   PetscCall(PetscLogGpuTimeEnd());
1672   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1673   PetscFunctionReturn(PETSC_SUCCESS);
1674 }
1675 
1676 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1677 {
1678   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1679   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1680   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1681   CsrMatrix                    *Acsr;
1682   PetscInt                      m, nz;
1683   PetscBool                     flg;
1684 
1685   PetscFunctionBegin;
1686   if (PetscDefined(USE_DEBUG)) {
1687     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1688     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1689   }
1690 
1691   /* Copy A's value to fact */
1692   m  = fact->rmap->n;
1693   nz = aij->nz;
1694   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1695   Acsr = (CsrMatrix *)Acusp->mat->mat;
1696   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1697 
1698   /* Factorize fact inplace */
1699   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1700      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1701      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1702      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1703      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1704    */
1705   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1706   if (PetscDefined(USE_DEBUG)) {
1707     int              numerical_zero;
1708     cusparseStatus_t status;
1709     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1710     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1711   }
1712 
1713   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1714 
1715   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1716     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1717   */
1718   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1719 
1720   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1721   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1722   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1723   fact->ops->matsolve          = NULL;
1724   fact->ops->matsolvetranspose = NULL;
1725   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1726   PetscFunctionReturn(PETSC_SUCCESS);
1727 }
1728 
1729 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1730 {
1731   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1732   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1733   PetscInt                      m, nz;
1734 
1735   PetscFunctionBegin;
1736   if (PetscDefined(USE_DEBUG)) {
1737     PetscInt  i;
1738     PetscBool flg, missing;
1739 
1740     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1741     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1742     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1743     PetscCall(MatMissingDiagonal(A, &missing, &i));
1744     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1745   }
1746 
1747   /* Free the old stale stuff */
1748   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1749 
1750   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1751      but they will not be used. Allocate them just for easy debugging.
1752    */
1753   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1754 
1755   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1756   fact->factortype             = MAT_FACTOR_ICC;
1757   fact->info.factor_mallocs    = 0;
1758   fact->info.fill_ratio_given  = info->fill;
1759   fact->info.fill_ratio_needed = 1.0;
1760 
1761   aij->row = NULL;
1762   aij->col = NULL;
1763 
1764   /* ====================================================================== */
1765   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1766   /* We'll do in-place factorization on fact                                */
1767   /* ====================================================================== */
1768   const int *Ai, *Aj;
1769 
1770   m  = fact->rmap->n;
1771   nz = aij->nz;
1772 
1773   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1774   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1775   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1776   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1777   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1778   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1779 
1780   /* ====================================================================== */
1781   /* Create mat descriptors for M, L                                        */
1782   /* ====================================================================== */
1783   cusparseFillMode_t fillMode;
1784   cusparseDiagType_t diagType;
1785 
1786   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1787   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1788   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1789 
1790   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1791     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1792     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1793     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1794     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1795   */
1796   fillMode = CUSPARSE_FILL_MODE_LOWER;
1797   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1798   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1799   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1800   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1801 
1802   /* ========================================================================= */
1803   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1804   /* ========================================================================= */
1805   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
1806   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1807 
1808   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1809   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1810 
1811   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1812   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1813 
1814   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1815   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1816 
1817   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1818   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1819 
1820   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1821      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1822    */
1823   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1824     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1825     fs->spsvBuffer_L = fs->factBuffer_M;
1826     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1827   } else {
1828     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1829     fs->spsvBuffer_Lt = fs->factBuffer_M;
1830     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1831   }
1832 
1833   /* ========================================================================== */
1834   /* Perform analysis of ic0 on M                                               */
1835   /* The lower triangular part of M has the same sparsity pattern as L          */
1836   /* ========================================================================== */
1837   int              structural_zero;
1838   cusparseStatus_t status;
1839 
1840   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1841   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1842   if (PetscDefined(USE_DEBUG)) {
1843     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1844     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1845     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1846   }
1847 
1848   /* Estimate FLOPs of the numeric factorization */
1849   {
1850     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1851     PetscInt      *Ai, nzRow, nzLeft;
1852     PetscLogDouble flops = 0.0;
1853 
1854     Ai = Aseq->i;
1855     for (PetscInt i = 0; i < m; i++) {
1856       nzRow = Ai[i + 1] - Ai[i];
1857       if (nzRow > 1) {
1858         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1859           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1860         */
1861         nzLeft = (nzRow - 1) / 2;
1862         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1863       }
1864     }
1865     fs->numericFactFlops = flops;
1866   }
1867   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1868   PetscFunctionReturn(PETSC_SUCCESS);
1869 }
1870 #endif
1871 
1872 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1873 {
1874   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1875 
1876   PetscFunctionBegin;
1877 #if CUSPARSE_VERSION >= 11500
1878   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1879   if (cusparseTriFactors->factorizeOnDevice) {
1880     PetscCall(ISIdentity(isrow, &row_identity));
1881     PetscCall(ISIdentity(iscol, &col_identity));
1882   }
1883   if (!info->levels && row_identity && col_identity) {
1884     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1885   } else
1886 #endif
1887   {
1888     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1889     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1890     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1891   }
1892   PetscFunctionReturn(PETSC_SUCCESS);
1893 }
1894 
1895 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1896 {
1897   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1898 
1899   PetscFunctionBegin;
1900   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1901   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1902   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1903   PetscFunctionReturn(PETSC_SUCCESS);
1904 }
1905 
1906 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1907 {
1908   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1909 
1910   PetscFunctionBegin;
1911 #if CUSPARSE_VERSION >= 11500
1912   PetscBool perm_identity = PETSC_FALSE;
1913   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1914   if (!info->levels && perm_identity) {
1915     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1916   } else
1917 #endif
1918   {
1919     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1920     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1921     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1922   }
1923   PetscFunctionReturn(PETSC_SUCCESS);
1924 }
1925 
1926 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1927 {
1928   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1929 
1930   PetscFunctionBegin;
1931   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1932   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1933   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1934   PetscFunctionReturn(PETSC_SUCCESS);
1935 }
1936 
1937 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1938 {
1939   PetscFunctionBegin;
1940   *type = MATSOLVERCUSPARSE;
1941   PetscFunctionReturn(PETSC_SUCCESS);
1942 }
1943 
1944 /*MC
1945   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
1946   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
1947   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1948   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1949   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1950   algorithms are not recommended. This class does NOT support direct solver operations.
1951 
1952   Level: beginner
1953 
1954 .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1955 M*/
1956 
1957 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1958 {
1959   PetscInt  n = A->rmap->n;
1960   PetscBool factOnDevice, factOnHost;
1961   char     *prefix;
1962   char      factPlace[32] = "device"; /* the default */
1963 
1964   PetscFunctionBegin;
1965   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1966   PetscCall(MatSetSizes(*B, n, n, n, n));
1967   (*B)->factortype = ftype;
1968   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
1969 
1970   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1971   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1972   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1973   PetscOptionsEnd();
1974   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1975   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1976   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1977   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
1978 
1979   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1980   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1981     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1982     if (!A->boundtocpu) {
1983       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1984       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1985     } else {
1986       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1987       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1988     }
1989     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1990     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1991     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1992   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1993     if (!A->boundtocpu) {
1994       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1995       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1996     } else {
1997       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1998       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1999     }
2000     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2001     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2002   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2003 
2004   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2005   (*B)->canuseordering = PETSC_TRUE;
2006   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2007   PetscFunctionReturn(PETSC_SUCCESS);
2008 }
2009 
2010 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2011 {
2012   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2013   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2014 #if CUSPARSE_VERSION >= 13500
2015   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2016 #endif
2017 
2018   PetscFunctionBegin;
2019   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2020     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2021     if (A->factortype == MAT_FACTOR_NONE) {
2022       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2023       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2024     }
2025 #if CUSPARSE_VERSION >= 13500
2026     else if (fs->csrVal) {
2027       /* We have a factorized matrix on device and are able to copy it to host */
2028       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2029     }
2030 #endif
2031     else
2032       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2033     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2034     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2035     A->offloadmask = PETSC_OFFLOAD_BOTH;
2036   }
2037   PetscFunctionReturn(PETSC_SUCCESS);
2038 }
2039 
2040 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2041 {
2042   PetscFunctionBegin;
2043   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2044   *array = ((Mat_SeqAIJ *)A->data)->a;
2045   PetscFunctionReturn(PETSC_SUCCESS);
2046 }
2047 
2048 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2049 {
2050   PetscFunctionBegin;
2051   A->offloadmask = PETSC_OFFLOAD_CPU;
2052   *array         = NULL;
2053   PetscFunctionReturn(PETSC_SUCCESS);
2054 }
2055 
2056 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2057 {
2058   PetscFunctionBegin;
2059   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2060   *array = ((Mat_SeqAIJ *)A->data)->a;
2061   PetscFunctionReturn(PETSC_SUCCESS);
2062 }
2063 
2064 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2065 {
2066   PetscFunctionBegin;
2067   *array = NULL;
2068   PetscFunctionReturn(PETSC_SUCCESS);
2069 }
2070 
2071 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2072 {
2073   PetscFunctionBegin;
2074   *array = ((Mat_SeqAIJ *)A->data)->a;
2075   PetscFunctionReturn(PETSC_SUCCESS);
2076 }
2077 
2078 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2079 {
2080   PetscFunctionBegin;
2081   A->offloadmask = PETSC_OFFLOAD_CPU;
2082   *array         = NULL;
2083   PetscFunctionReturn(PETSC_SUCCESS);
2084 }
2085 
2086 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2087 {
2088   Mat_SeqAIJCUSPARSE *cusp;
2089   CsrMatrix          *matrix;
2090 
2091   PetscFunctionBegin;
2092   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2093   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2094   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2095   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2096   matrix = (CsrMatrix *)cusp->mat->mat;
2097 
2098   if (i) {
2099 #if !defined(PETSC_USE_64BIT_INDICES)
2100     *i = matrix->row_offsets->data().get();
2101 #else
2102     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2103 #endif
2104   }
2105   if (j) {
2106 #if !defined(PETSC_USE_64BIT_INDICES)
2107     *j = matrix->column_indices->data().get();
2108 #else
2109     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2110 #endif
2111   }
2112   if (a) *a = matrix->values->data().get();
2113   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2114   PetscFunctionReturn(PETSC_SUCCESS);
2115 }
2116 
2117 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2118 {
2119   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2120   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2121   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2122   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2123   cusparseStatus_t              stat;
2124   PetscBool                     both = PETSC_TRUE;
2125 
2126   PetscFunctionBegin;
2127   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2128   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2129     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2130       CsrMatrix *matrix;
2131       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2132 
2133       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2134       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2135       matrix->values->assign(a->a, a->a + a->nz);
2136       PetscCallCUDA(WaitForCUDA());
2137       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2138       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2139       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2140     } else {
2141       PetscInt nnz;
2142       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2143       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2144       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2145       delete cusparsestruct->workVector;
2146       delete cusparsestruct->rowoffsets_gpu;
2147       cusparsestruct->workVector     = NULL;
2148       cusparsestruct->rowoffsets_gpu = NULL;
2149       try {
2150         if (a->compressedrow.use) {
2151           m    = a->compressedrow.nrows;
2152           ii   = a->compressedrow.i;
2153           ridx = a->compressedrow.rindex;
2154         } else {
2155           m    = A->rmap->n;
2156           ii   = a->i;
2157           ridx = NULL;
2158         }
2159         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2160         if (!a->a) {
2161           nnz  = ii[m];
2162           both = PETSC_FALSE;
2163         } else nnz = a->nz;
2164         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2165 
2166         /* create cusparse matrix */
2167         cusparsestruct->nrows = m;
2168         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2169         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2170         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2171         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2172 
2173         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2174         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2175         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2176         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2177         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2178         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2179         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2180 
2181         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2182         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2183           /* set the matrix */
2184           CsrMatrix *mat   = new CsrMatrix;
2185           mat->num_rows    = m;
2186           mat->num_cols    = A->cmap->n;
2187           mat->num_entries = nnz;
2188           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2189           mat->row_offsets->assign(ii, ii + m + 1);
2190 
2191           mat->column_indices = new THRUSTINTARRAY32(nnz);
2192           mat->column_indices->assign(a->j, a->j + nnz);
2193 
2194           mat->values = new THRUSTARRAY(nnz);
2195           if (a->a) mat->values->assign(a->a, a->a + nnz);
2196 
2197           /* assign the pointer */
2198           matstruct->mat = mat;
2199 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2200           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2201             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2202                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2203             PetscCallCUSPARSE(stat);
2204           }
2205 #endif
2206         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2207 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2208           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2209 #else
2210           CsrMatrix *mat = new CsrMatrix;
2211           mat->num_rows = m;
2212           mat->num_cols = A->cmap->n;
2213           mat->num_entries = nnz;
2214           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2215           mat->row_offsets->assign(ii, ii + m + 1);
2216 
2217           mat->column_indices = new THRUSTINTARRAY32(nnz);
2218           mat->column_indices->assign(a->j, a->j + nnz);
2219 
2220           mat->values = new THRUSTARRAY(nnz);
2221           if (a->a) mat->values->assign(a->a, a->a + nnz);
2222 
2223           cusparseHybMat_t hybMat;
2224           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2225           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2226           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2227           PetscCallCUSPARSE(stat);
2228           /* assign the pointer */
2229           matstruct->mat = hybMat;
2230 
2231           if (mat) {
2232             if (mat->values) delete (THRUSTARRAY *)mat->values;
2233             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2234             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2235             delete (CsrMatrix *)mat;
2236           }
2237 #endif
2238         }
2239 
2240         /* assign the compressed row indices */
2241         if (a->compressedrow.use) {
2242           cusparsestruct->workVector = new THRUSTARRAY(m);
2243           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2244           matstruct->cprowIndices->assign(ridx, ridx + m);
2245           tmp = m;
2246         } else {
2247           cusparsestruct->workVector = NULL;
2248           matstruct->cprowIndices    = NULL;
2249           tmp                        = 0;
2250         }
2251         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2252 
2253         /* assign the pointer */
2254         cusparsestruct->mat = matstruct;
2255       } catch (char *ex) {
2256         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2257       }
2258       PetscCallCUDA(WaitForCUDA());
2259       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2260       cusparsestruct->nonzerostate = A->nonzerostate;
2261     }
2262     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2263   }
2264   PetscFunctionReturn(PETSC_SUCCESS);
2265 }
2266 
2267 struct VecCUDAPlusEquals {
2268   template <typename Tuple>
2269   __host__ __device__ void operator()(Tuple t)
2270   {
2271     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2272   }
2273 };
2274 
2275 struct VecCUDAEquals {
2276   template <typename Tuple>
2277   __host__ __device__ void operator()(Tuple t)
2278   {
2279     thrust::get<1>(t) = thrust::get<0>(t);
2280   }
2281 };
2282 
2283 struct VecCUDAEqualsReverse {
2284   template <typename Tuple>
2285   __host__ __device__ void operator()(Tuple t)
2286   {
2287     thrust::get<0>(t) = thrust::get<1>(t);
2288   }
2289 };
2290 
2291 struct MatMatCusparse {
2292   PetscBool      cisdense;
2293   PetscScalar   *Bt;
2294   Mat            X;
2295   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2296   PetscLogDouble flops;
2297   CsrMatrix     *Bcsr;
2298 
2299 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2300   cusparseSpMatDescr_t matSpBDescr;
2301   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2302   cusparseDnMatDescr_t matBDescr;
2303   cusparseDnMatDescr_t matCDescr;
2304   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2305   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2306   void *dBuffer4;
2307   void *dBuffer5;
2308   #endif
2309   size_t                mmBufferSize;
2310   void                 *mmBuffer;
2311   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2312   cusparseSpGEMMDescr_t spgemmDesc;
2313 #endif
2314 };
2315 
2316 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2317 {
2318   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2319 
2320   PetscFunctionBegin;
2321   PetscCallCUDA(cudaFree(mmdata->Bt));
2322   delete mmdata->Bcsr;
2323 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2324   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2325   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2326   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2327   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2328   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2329   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2330   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2331   #endif
2332   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2333   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2334 #endif
2335   PetscCall(MatDestroy(&mmdata->X));
2336   PetscCall(PetscFree(data));
2337   PetscFunctionReturn(PETSC_SUCCESS);
2338 }
2339 
2340 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2341 
2342 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2343 {
2344   Mat_Product                  *product = C->product;
2345   Mat                           A, B;
2346   PetscInt                      m, n, blda, clda;
2347   PetscBool                     flg, biscuda;
2348   Mat_SeqAIJCUSPARSE           *cusp;
2349   cusparseStatus_t              stat;
2350   cusparseOperation_t           opA;
2351   const PetscScalar            *barray;
2352   PetscScalar                  *carray;
2353   MatMatCusparse               *mmdata;
2354   Mat_SeqAIJCUSPARSEMultStruct *mat;
2355   CsrMatrix                    *csrmat;
2356 
2357   PetscFunctionBegin;
2358   MatCheckProduct(C, 1);
2359   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2360   mmdata = (MatMatCusparse *)product->data;
2361   A      = product->A;
2362   B      = product->B;
2363   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2364   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2365   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2366      Instead of silently accepting the wrong answer, I prefer to raise the error */
2367   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2368   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2369   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2370   switch (product->type) {
2371   case MATPRODUCT_AB:
2372   case MATPRODUCT_PtAP:
2373     mat = cusp->mat;
2374     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2375     m   = A->rmap->n;
2376     n   = B->cmap->n;
2377     break;
2378   case MATPRODUCT_AtB:
2379     if (!A->form_explicit_transpose) {
2380       mat = cusp->mat;
2381       opA = CUSPARSE_OPERATION_TRANSPOSE;
2382     } else {
2383       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2384       mat = cusp->matTranspose;
2385       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2386     }
2387     m = A->cmap->n;
2388     n = B->cmap->n;
2389     break;
2390   case MATPRODUCT_ABt:
2391   case MATPRODUCT_RARt:
2392     mat = cusp->mat;
2393     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2394     m   = A->rmap->n;
2395     n   = B->rmap->n;
2396     break;
2397   default:
2398     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2399   }
2400   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2401   csrmat = (CsrMatrix *)mat->mat;
2402   /* if the user passed a CPU matrix, copy the data to the GPU */
2403   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2404   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2405   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2406 
2407   PetscCall(MatDenseGetLDA(B, &blda));
2408   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2409     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2410     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2411   } else {
2412     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2413     PetscCall(MatDenseGetLDA(C, &clda));
2414   }
2415 
2416   PetscCall(PetscLogGpuTimeBegin());
2417 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2418   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2419   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2420   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2421     size_t mmBufferSize;
2422     if (mmdata->initialized && mmdata->Blda != blda) {
2423       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2424       mmdata->matBDescr = NULL;
2425     }
2426     if (!mmdata->matBDescr) {
2427       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2428       mmdata->Blda = blda;
2429     }
2430 
2431     if (mmdata->initialized && mmdata->Clda != clda) {
2432       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2433       mmdata->matCDescr = NULL;
2434     }
2435     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2436       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2437       mmdata->Clda = clda;
2438     }
2439 
2440     if (!mat->matDescr) {
2441       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2442                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2443       PetscCallCUSPARSE(stat);
2444     }
2445     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2446     PetscCallCUSPARSE(stat);
2447     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2448       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2449       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2450       mmdata->mmBufferSize = mmBufferSize;
2451     }
2452     mmdata->initialized = PETSC_TRUE;
2453   } else {
2454     /* to be safe, always update pointers of the mats */
2455     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2456     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2457     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2458   }
2459 
2460   /* do cusparseSpMM, which supports transpose on B */
2461   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2462   PetscCallCUSPARSE(stat);
2463 #else
2464   PetscInt k;
2465   /* cusparseXcsrmm does not support transpose on B */
2466   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2467     cublasHandle_t cublasv2handle;
2468     cublasStatus_t cerr;
2469 
2470     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2471     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2472     PetscCallCUBLAS(cerr);
2473     blda = B->cmap->n;
2474     k = B->cmap->n;
2475   } else {
2476     k = B->rmap->n;
2477   }
2478 
2479   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2480   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2481   PetscCallCUSPARSE(stat);
2482 #endif
2483   PetscCall(PetscLogGpuTimeEnd());
2484   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2485   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2486   if (product->type == MATPRODUCT_RARt) {
2487     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2488     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2489   } else if (product->type == MATPRODUCT_PtAP) {
2490     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2491     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2492   } else {
2493     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2494   }
2495   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2496   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2497   PetscFunctionReturn(PETSC_SUCCESS);
2498 }
2499 
2500 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2501 {
2502   Mat_Product        *product = C->product;
2503   Mat                 A, B;
2504   PetscInt            m, n;
2505   PetscBool           cisdense, flg;
2506   MatMatCusparse     *mmdata;
2507   Mat_SeqAIJCUSPARSE *cusp;
2508 
2509   PetscFunctionBegin;
2510   MatCheckProduct(C, 1);
2511   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2512   A = product->A;
2513   B = product->B;
2514   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2515   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2516   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2517   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2518   switch (product->type) {
2519   case MATPRODUCT_AB:
2520     m = A->rmap->n;
2521     n = B->cmap->n;
2522     break;
2523   case MATPRODUCT_AtB:
2524     m = A->cmap->n;
2525     n = B->cmap->n;
2526     break;
2527   case MATPRODUCT_ABt:
2528     m = A->rmap->n;
2529     n = B->rmap->n;
2530     break;
2531   case MATPRODUCT_PtAP:
2532     m = B->cmap->n;
2533     n = B->cmap->n;
2534     break;
2535   case MATPRODUCT_RARt:
2536     m = B->rmap->n;
2537     n = B->rmap->n;
2538     break;
2539   default:
2540     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2541   }
2542   PetscCall(MatSetSizes(C, m, n, m, n));
2543   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2544   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2545   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2546 
2547   /* product data */
2548   PetscCall(PetscNew(&mmdata));
2549   mmdata->cisdense = cisdense;
2550 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2551   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2552   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2553 #endif
2554   /* for these products we need intermediate storage */
2555   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2556     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2557     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2558     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2559       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2560     } else {
2561       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2562     }
2563   }
2564   C->product->data    = mmdata;
2565   C->product->destroy = MatDestroy_MatMatCusparse;
2566 
2567   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2568   PetscFunctionReturn(PETSC_SUCCESS);
2569 }
2570 
2571 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2572 {
2573   Mat_Product                  *product = C->product;
2574   Mat                           A, B;
2575   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2576   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2577   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2578   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2579   PetscBool                     flg;
2580   cusparseStatus_t              stat;
2581   MatProductType                ptype;
2582   MatMatCusparse               *mmdata;
2583 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2584   cusparseSpMatDescr_t BmatSpDescr;
2585 #endif
2586   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2587 
2588   PetscFunctionBegin;
2589   MatCheckProduct(C, 1);
2590   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2591   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2592   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2593   mmdata = (MatMatCusparse *)C->product->data;
2594   A      = product->A;
2595   B      = product->B;
2596   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2597     mmdata->reusesym = PETSC_FALSE;
2598     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2599     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2600     Cmat = Ccusp->mat;
2601     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2602     Ccsr = (CsrMatrix *)Cmat->mat;
2603     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2604     goto finalize;
2605   }
2606   if (!c->nz) goto finalize;
2607   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2608   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2609   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2610   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2611   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2612   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2613   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2614   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2615   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2616   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2617   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2618   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2619   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2620   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2621 
2622   ptype = product->type;
2623   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2624     ptype = MATPRODUCT_AB;
2625     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2626   }
2627   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2628     ptype = MATPRODUCT_AB;
2629     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2630   }
2631   switch (ptype) {
2632   case MATPRODUCT_AB:
2633     Amat = Acusp->mat;
2634     Bmat = Bcusp->mat;
2635     break;
2636   case MATPRODUCT_AtB:
2637     Amat = Acusp->matTranspose;
2638     Bmat = Bcusp->mat;
2639     break;
2640   case MATPRODUCT_ABt:
2641     Amat = Acusp->mat;
2642     Bmat = Bcusp->matTranspose;
2643     break;
2644   default:
2645     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2646   }
2647   Cmat = Ccusp->mat;
2648   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2649   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2650   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2651   Acsr = (CsrMatrix *)Amat->mat;
2652   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2653   Ccsr = (CsrMatrix *)Cmat->mat;
2654   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2655   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2656   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2657   PetscCall(PetscLogGpuTimeBegin());
2658 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2659   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2660   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2661   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2662   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2663   PetscCallCUSPARSE(stat);
2664   #else
2665   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2666   PetscCallCUSPARSE(stat);
2667   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2668   PetscCallCUSPARSE(stat);
2669   #endif
2670 #else
2671   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2672                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2673   PetscCallCUSPARSE(stat);
2674 #endif
2675   PetscCall(PetscLogGpuFlops(mmdata->flops));
2676   PetscCallCUDA(WaitForCUDA());
2677   PetscCall(PetscLogGpuTimeEnd());
2678   C->offloadmask = PETSC_OFFLOAD_GPU;
2679 finalize:
2680   /* shorter version of MatAssemblyEnd_SeqAIJ */
2681   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2682   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2683   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2684   c->reallocs = 0;
2685   C->info.mallocs += 0;
2686   C->info.nz_unneeded = 0;
2687   C->assembled = C->was_assembled = PETSC_TRUE;
2688   C->num_ass++;
2689   PetscFunctionReturn(PETSC_SUCCESS);
2690 }
2691 
2692 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2693 {
2694   Mat_Product                  *product = C->product;
2695   Mat                           A, B;
2696   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2697   Mat_SeqAIJ                   *a, *b, *c;
2698   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2699   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2700   PetscInt                      i, j, m, n, k;
2701   PetscBool                     flg;
2702   cusparseStatus_t              stat;
2703   MatProductType                ptype;
2704   MatMatCusparse               *mmdata;
2705   PetscLogDouble                flops;
2706   PetscBool                     biscompressed, ciscompressed;
2707 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2708   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2709   cusparseSpMatDescr_t BmatSpDescr;
2710 #else
2711   int cnz;
2712 #endif
2713   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2714 
2715   PetscFunctionBegin;
2716   MatCheckProduct(C, 1);
2717   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2718   A = product->A;
2719   B = product->B;
2720   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2721   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2722   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2723   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2724   a = (Mat_SeqAIJ *)A->data;
2725   b = (Mat_SeqAIJ *)B->data;
2726   /* product data */
2727   PetscCall(PetscNew(&mmdata));
2728   C->product->data    = mmdata;
2729   C->product->destroy = MatDestroy_MatMatCusparse;
2730 
2731   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2732   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2733   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2734   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2735   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2736   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2737 
2738   ptype = product->type;
2739   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2740     ptype                                          = MATPRODUCT_AB;
2741     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2742   }
2743   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2744     ptype                                          = MATPRODUCT_AB;
2745     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2746   }
2747   biscompressed = PETSC_FALSE;
2748   ciscompressed = PETSC_FALSE;
2749   switch (ptype) {
2750   case MATPRODUCT_AB:
2751     m    = A->rmap->n;
2752     n    = B->cmap->n;
2753     k    = A->cmap->n;
2754     Amat = Acusp->mat;
2755     Bmat = Bcusp->mat;
2756     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2757     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2758     break;
2759   case MATPRODUCT_AtB:
2760     m = A->cmap->n;
2761     n = B->cmap->n;
2762     k = A->rmap->n;
2763     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2764     Amat = Acusp->matTranspose;
2765     Bmat = Bcusp->mat;
2766     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2767     break;
2768   case MATPRODUCT_ABt:
2769     m = A->rmap->n;
2770     n = B->rmap->n;
2771     k = A->cmap->n;
2772     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2773     Amat = Acusp->mat;
2774     Bmat = Bcusp->matTranspose;
2775     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2776     break;
2777   default:
2778     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2779   }
2780 
2781   /* create cusparse matrix */
2782   PetscCall(MatSetSizes(C, m, n, m, n));
2783   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2784   c     = (Mat_SeqAIJ *)C->data;
2785   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2786   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2787   Ccsr  = new CsrMatrix;
2788 
2789   c->compressedrow.use = ciscompressed;
2790   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2791     c->compressedrow.nrows = a->compressedrow.nrows;
2792     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2793     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2794     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2795     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2796     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2797   } else {
2798     c->compressedrow.nrows  = 0;
2799     c->compressedrow.i      = NULL;
2800     c->compressedrow.rindex = NULL;
2801     Ccusp->workVector       = NULL;
2802     Cmat->cprowIndices      = NULL;
2803   }
2804   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2805   Ccusp->mat        = Cmat;
2806   Ccusp->mat->mat   = Ccsr;
2807   Ccsr->num_rows    = Ccusp->nrows;
2808   Ccsr->num_cols    = n;
2809   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2810   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2811   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2812   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2813   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
2814   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
2815   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2816   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2817   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2818   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2819   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2820     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2821     c->nz                = 0;
2822     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2823     Ccsr->values         = new THRUSTARRAY(c->nz);
2824     goto finalizesym;
2825   }
2826 
2827   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2828   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2829   Acsr = (CsrMatrix *)Amat->mat;
2830   if (!biscompressed) {
2831     Bcsr = (CsrMatrix *)Bmat->mat;
2832 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2833     BmatSpDescr = Bmat->matDescr;
2834 #endif
2835   } else { /* we need to use row offsets for the full matrix */
2836     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2837     Bcsr                 = new CsrMatrix;
2838     Bcsr->num_rows       = B->rmap->n;
2839     Bcsr->num_cols       = cBcsr->num_cols;
2840     Bcsr->num_entries    = cBcsr->num_entries;
2841     Bcsr->column_indices = cBcsr->column_indices;
2842     Bcsr->values         = cBcsr->values;
2843     if (!Bcusp->rowoffsets_gpu) {
2844       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2845       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2846       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2847     }
2848     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2849     mmdata->Bcsr      = Bcsr;
2850 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2851     if (Bcsr->num_rows && Bcsr->num_cols) {
2852       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2853       PetscCallCUSPARSE(stat);
2854     }
2855     BmatSpDescr = mmdata->matSpBDescr;
2856 #endif
2857   }
2858   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2859   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2860   /* precompute flops count */
2861   if (ptype == MATPRODUCT_AB) {
2862     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2863       const PetscInt st = a->i[i];
2864       const PetscInt en = a->i[i + 1];
2865       for (j = st; j < en; j++) {
2866         const PetscInt brow = a->j[j];
2867         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2868       }
2869     }
2870   } else if (ptype == MATPRODUCT_AtB) {
2871     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2872       const PetscInt anzi = a->i[i + 1] - a->i[i];
2873       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2874       flops += (2. * anzi) * bnzi;
2875     }
2876   } else { /* TODO */
2877     flops = 0.;
2878   }
2879 
2880   mmdata->flops = flops;
2881   PetscCall(PetscLogGpuTimeBegin());
2882 
2883 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2884   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2885   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2886   PetscCallCUSPARSE(stat);
2887   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2888   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2889   {
2890     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2891      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2892   */
2893     void *dBuffer1 = NULL;
2894     void *dBuffer2 = NULL;
2895     void *dBuffer3 = NULL;
2896     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2897     size_t bufferSize1 = 0;
2898     size_t bufferSize2 = 0;
2899     size_t bufferSize3 = 0;
2900     size_t bufferSize4 = 0;
2901     size_t bufferSize5 = 0;
2902 
2903     /*----------------------------------------------------------------------*/
2904     /* ask bufferSize1 bytes for external memory */
2905     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2906     PetscCallCUSPARSE(stat);
2907     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2908     /* inspect the matrices A and B to understand the memory requirement for the next step */
2909     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2910     PetscCallCUSPARSE(stat);
2911 
2912     /*----------------------------------------------------------------------*/
2913     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2914     PetscCallCUSPARSE(stat);
2915     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2916     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2917     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2918     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2919     PetscCallCUSPARSE(stat);
2920     PetscCallCUDA(cudaFree(dBuffer1));
2921     PetscCallCUDA(cudaFree(dBuffer2));
2922 
2923     /*----------------------------------------------------------------------*/
2924     /* get matrix C non-zero entries C_nnz1 */
2925     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2926     c->nz = (PetscInt)C_nnz1;
2927     /* allocate matrix C */
2928     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2929     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2930     Ccsr->values = new THRUSTARRAY(c->nz);
2931     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2932     /* update matC with the new pointers */
2933     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2934     PetscCallCUSPARSE(stat);
2935 
2936     /*----------------------------------------------------------------------*/
2937     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2938     PetscCallCUSPARSE(stat);
2939     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2940     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2941     PetscCallCUSPARSE(stat);
2942     PetscCallCUDA(cudaFree(dBuffer3));
2943     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2944     PetscCallCUSPARSE(stat);
2945     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2946   }
2947   #else
2948   size_t bufSize2;
2949   /* ask bufferSize bytes for external memory */
2950   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2951   PetscCallCUSPARSE(stat);
2952   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2953   /* inspect the matrices A and B to understand the memory requirement for the next step */
2954   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2955   PetscCallCUSPARSE(stat);
2956   /* ask bufferSize again bytes for external memory */
2957   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2958   PetscCallCUSPARSE(stat);
2959   /* The CUSPARSE documentation is not clear, nor the API
2960      We need both buffers to perform the operations properly!
2961      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2962      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2963      is stored in the descriptor! What a messy API... */
2964   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2965   /* compute the intermediate product of A * B */
2966   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2967   PetscCallCUSPARSE(stat);
2968   /* get matrix C non-zero entries C_nnz1 */
2969   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2970   c->nz = (PetscInt)C_nnz1;
2971   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2972                       mmdata->mmBufferSize / 1024));
2973   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2974   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2975   Ccsr->values = new THRUSTARRAY(c->nz);
2976   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2977   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2978   PetscCallCUSPARSE(stat);
2979   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2980   PetscCallCUSPARSE(stat);
2981   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2982 #else
2983   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2984   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2985                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2986   PetscCallCUSPARSE(stat);
2987   c->nz = cnz;
2988   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2989   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2990   Ccsr->values = new THRUSTARRAY(c->nz);
2991   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2992 
2993   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2994   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2995      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2996      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2997   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2998                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2999   PetscCallCUSPARSE(stat);
3000 #endif
3001   PetscCall(PetscLogGpuFlops(mmdata->flops));
3002   PetscCall(PetscLogGpuTimeEnd());
3003 finalizesym:
3004   c->singlemalloc = PETSC_FALSE;
3005   c->free_a       = PETSC_TRUE;
3006   c->free_ij      = PETSC_TRUE;
3007   PetscCall(PetscMalloc1(m + 1, &c->i));
3008   PetscCall(PetscMalloc1(c->nz, &c->j));
3009   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3010     PetscInt      *d_i = c->i;
3011     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3012     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3013     ii = *Ccsr->row_offsets;
3014     jj = *Ccsr->column_indices;
3015     if (ciscompressed) d_i = c->compressedrow.i;
3016     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3017     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3018   } else {
3019     PetscInt *d_i = c->i;
3020     if (ciscompressed) d_i = c->compressedrow.i;
3021     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3022     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3023   }
3024   if (ciscompressed) { /* need to expand host row offsets */
3025     PetscInt r = 0;
3026     c->i[0]    = 0;
3027     for (k = 0; k < c->compressedrow.nrows; k++) {
3028       const PetscInt next = c->compressedrow.rindex[k];
3029       const PetscInt old  = c->compressedrow.i[k];
3030       for (; r < next; r++) c->i[r + 1] = old;
3031     }
3032     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3033   }
3034   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3035   PetscCall(PetscMalloc1(m, &c->ilen));
3036   PetscCall(PetscMalloc1(m, &c->imax));
3037   c->maxnz         = c->nz;
3038   c->nonzerorowcnt = 0;
3039   c->rmax          = 0;
3040   for (k = 0; k < m; k++) {
3041     const PetscInt nn = c->i[k + 1] - c->i[k];
3042     c->ilen[k] = c->imax[k] = nn;
3043     c->nonzerorowcnt += (PetscInt) !!nn;
3044     c->rmax = PetscMax(c->rmax, nn);
3045   }
3046   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3047   PetscCall(PetscMalloc1(c->nz, &c->a));
3048   Ccsr->num_entries = c->nz;
3049 
3050   C->nonzerostate++;
3051   PetscCall(PetscLayoutSetUp(C->rmap));
3052   PetscCall(PetscLayoutSetUp(C->cmap));
3053   Ccusp->nonzerostate = C->nonzerostate;
3054   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3055   C->preallocated     = PETSC_TRUE;
3056   C->assembled        = PETSC_FALSE;
3057   C->was_assembled    = PETSC_FALSE;
3058   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3059     mmdata->reusesym = PETSC_TRUE;
3060     C->offloadmask   = PETSC_OFFLOAD_GPU;
3061   }
3062   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3063   PetscFunctionReturn(PETSC_SUCCESS);
3064 }
3065 
3066 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3067 
3068 /* handles sparse or dense B */
3069 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3070 {
3071   Mat_Product *product = mat->product;
3072   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3073 
3074   PetscFunctionBegin;
3075   MatCheckProduct(mat, 1);
3076   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3077   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3078   if (product->type == MATPRODUCT_ABC) {
3079     Ciscusp = PETSC_FALSE;
3080     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3081   }
3082   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3083     PetscBool usecpu = PETSC_FALSE;
3084     switch (product->type) {
3085     case MATPRODUCT_AB:
3086       if (product->api_user) {
3087         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3088         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3089         PetscOptionsEnd();
3090       } else {
3091         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3092         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3093         PetscOptionsEnd();
3094       }
3095       break;
3096     case MATPRODUCT_AtB:
3097       if (product->api_user) {
3098         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3099         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3100         PetscOptionsEnd();
3101       } else {
3102         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3103         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3104         PetscOptionsEnd();
3105       }
3106       break;
3107     case MATPRODUCT_PtAP:
3108       if (product->api_user) {
3109         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3110         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3111         PetscOptionsEnd();
3112       } else {
3113         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3114         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3115         PetscOptionsEnd();
3116       }
3117       break;
3118     case MATPRODUCT_RARt:
3119       if (product->api_user) {
3120         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3121         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3122         PetscOptionsEnd();
3123       } else {
3124         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3125         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3126         PetscOptionsEnd();
3127       }
3128       break;
3129     case MATPRODUCT_ABC:
3130       if (product->api_user) {
3131         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3132         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3133         PetscOptionsEnd();
3134       } else {
3135         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3136         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3137         PetscOptionsEnd();
3138       }
3139       break;
3140     default:
3141       break;
3142     }
3143     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3144   }
3145   /* dispatch */
3146   if (isdense) {
3147     switch (product->type) {
3148     case MATPRODUCT_AB:
3149     case MATPRODUCT_AtB:
3150     case MATPRODUCT_ABt:
3151     case MATPRODUCT_PtAP:
3152     case MATPRODUCT_RARt:
3153       if (product->A->boundtocpu) {
3154         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3155       } else {
3156         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3157       }
3158       break;
3159     case MATPRODUCT_ABC:
3160       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3161       break;
3162     default:
3163       break;
3164     }
3165   } else if (Biscusp && Ciscusp) {
3166     switch (product->type) {
3167     case MATPRODUCT_AB:
3168     case MATPRODUCT_AtB:
3169     case MATPRODUCT_ABt:
3170       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3171       break;
3172     case MATPRODUCT_PtAP:
3173     case MATPRODUCT_RARt:
3174     case MATPRODUCT_ABC:
3175       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3176       break;
3177     default:
3178       break;
3179     }
3180   } else { /* fallback for AIJ */
3181     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3182   }
3183   PetscFunctionReturn(PETSC_SUCCESS);
3184 }
3185 
3186 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3187 {
3188   PetscFunctionBegin;
3189   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3190   PetscFunctionReturn(PETSC_SUCCESS);
3191 }
3192 
3193 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3194 {
3195   PetscFunctionBegin;
3196   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3197   PetscFunctionReturn(PETSC_SUCCESS);
3198 }
3199 
3200 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3201 {
3202   PetscFunctionBegin;
3203   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3204   PetscFunctionReturn(PETSC_SUCCESS);
3205 }
3206 
3207 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3208 {
3209   PetscFunctionBegin;
3210   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3211   PetscFunctionReturn(PETSC_SUCCESS);
3212 }
3213 
3214 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3215 {
3216   PetscFunctionBegin;
3217   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3218   PetscFunctionReturn(PETSC_SUCCESS);
3219 }
3220 
3221 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3222 {
3223   int i = blockIdx.x * blockDim.x + threadIdx.x;
3224   if (i < n) y[idx[i]] += x[i];
3225 }
3226 
3227 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3228 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3229 {
3230   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3231   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3232   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3233   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3234   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3235   PetscBool                     compressed;
3236 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3237   PetscInt nx, ny;
3238 #endif
3239 
3240   PetscFunctionBegin;
3241   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3242   if (!a->nz) {
3243     if (yy) PetscCall(VecSeq_CUDA::copy(yy, zz));
3244     else PetscCall(VecSeq_CUDA::set(zz, 0));
3245     PetscFunctionReturn(PETSC_SUCCESS);
3246   }
3247   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3248   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3249   if (!trans) {
3250     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3251     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3252   } else {
3253     if (herm || !A->form_explicit_transpose) {
3254       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3255       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3256     } else {
3257       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3258       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3259     }
3260   }
3261   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3262   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3263 
3264   try {
3265     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3266     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3267     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3268 
3269     PetscCall(PetscLogGpuTimeBegin());
3270     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3271       /* z = A x + beta y.
3272          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3273          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3274       */
3275       xptr = xarray;
3276       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3277       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3278 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3279       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3280           allocated to accommodate different uses. So we get the length info directly from mat.
3281        */
3282       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3283         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3284         nx             = mat->num_cols;
3285         ny             = mat->num_rows;
3286       }
3287 #endif
3288     } else {
3289       /* z = A^T x + beta y
3290          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3291          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3292        */
3293       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3294       dptr = zarray;
3295       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3296       if (compressed) { /* Scatter x to work vector */
3297         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3298 
3299         thrust::for_each(
3300 #if PetscDefined(HAVE_THRUST_ASYNC)
3301           thrust::cuda::par.on(PetscDefaultCudaStream),
3302 #endif
3303           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3304           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3305       }
3306 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3307       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3308         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3309         nx             = mat->num_rows;
3310         ny             = mat->num_cols;
3311       }
3312 #endif
3313     }
3314 
3315     /* csr_spmv does y = alpha op(A) x + beta y */
3316     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3317 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3318       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3319       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3320         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3321         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3322         PetscCallCUSPARSE(
3323           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3324         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3325 
3326         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3327       } else {
3328         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3329         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3330         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3331       }
3332 
3333       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3334                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3335 #else
3336       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3337       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3338 #endif
3339     } else {
3340       if (cusparsestruct->nrows) {
3341 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3342         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3343 #else
3344         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3345         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3346 #endif
3347       }
3348     }
3349     PetscCall(PetscLogGpuTimeEnd());
3350 
3351     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3352       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3353         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3354           PetscCall(VecSeq_CUDA::copy(yy, zz));      /* zz = yy */
3355         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3356           PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
3357         }
3358       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3359         PetscCall(VecSeq_CUDA::set(zz, 0));
3360       }
3361 
3362       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3363       if (compressed) {
3364         PetscCall(PetscLogGpuTimeBegin());
3365         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3366            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3367            prevent that. So I just add a ScatterAdd kernel.
3368          */
3369 #if 0
3370         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3371         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3372                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3373                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3374                          VecCUDAPlusEquals());
3375 #else
3376         PetscInt n = matstruct->cprowIndices->size();
3377         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3378 #endif
3379         PetscCall(PetscLogGpuTimeEnd());
3380       }
3381     } else {
3382       if (yy && yy != zz) PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
3383     }
3384     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3385     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3386     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3387   } catch (char *ex) {
3388     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3389   }
3390   if (yy) {
3391     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3392   } else {
3393     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3394   }
3395   PetscFunctionReturn(PETSC_SUCCESS);
3396 }
3397 
3398 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3399 {
3400   PetscFunctionBegin;
3401   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3402   PetscFunctionReturn(PETSC_SUCCESS);
3403 }
3404 
3405 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3406 {
3407   PetscObjectState    onnz = A->nonzerostate;
3408   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
3409 
3410   PetscFunctionBegin;
3411   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3412   if (onnz != A->nonzerostate && cusp->deviceMat) {
3413     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
3414     PetscCallCUDA(cudaFree(cusp->deviceMat));
3415     cusp->deviceMat = NULL;
3416   }
3417   PetscFunctionReturn(PETSC_SUCCESS);
3418 }
3419 
3420 /* --------------------------------------------------------------------------------*/
3421 /*@
3422    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3423    (the default parallel PETSc format). This matrix will ultimately pushed down
3424    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3425    assembly performance the user should preallocate the matrix storage by setting
3426    the parameter nz (or the array nnz).  By setting these parameters accurately,
3427    performance during matrix assembly can be increased by more than a factor of 50.
3428 
3429    Collective
3430 
3431    Input Parameters:
3432 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3433 .  m - number of rows
3434 .  n - number of columns
3435 .  nz - number of nonzeros per row (same for all rows)
3436 -  nnz - array containing the number of nonzeros in the various rows
3437          (possibly different for each row) or NULL
3438 
3439    Output Parameter:
3440 .  A - the matrix
3441 
3442    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3443    MatXXXXSetPreallocation() paradgm instead of this routine directly.
3444    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3445 
3446    Notes:
3447    If nnz is given then nz is ignored
3448 
3449    The AIJ format, also called
3450    compressed row storage, is fully compatible with standard Fortran 77
3451    storage.  That is, the stored row and column indices can begin at
3452    either one (as in Fortran) or zero.  See the users' manual for details.
3453 
3454    Specify the preallocated storage with either nz or nnz (not both).
3455    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3456    allocation.  For large problems you MUST preallocate memory or you
3457    will get TERRIBLE performance, see the users' manual chapter on matrices.
3458 
3459    By default, this format uses inodes (identical nodes) when possible, to
3460    improve numerical efficiency of matrix-vector products and solves. We
3461    search for consecutive rows with the same nonzero structure, thereby
3462    reusing matrix information to achieve increased efficiency.
3463 
3464    Level: intermediate
3465 
3466 .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3467 @*/
3468 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3469 {
3470   PetscFunctionBegin;
3471   PetscCall(MatCreate(comm, A));
3472   PetscCall(MatSetSizes(*A, m, n, m, n));
3473   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3474   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3475   PetscFunctionReturn(PETSC_SUCCESS);
3476 }
3477 
3478 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3479 {
3480   PetscFunctionBegin;
3481   if (A->factortype == MAT_FACTOR_NONE) {
3482     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
3483   } else {
3484     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3485   }
3486   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3487   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3488   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3489   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3490   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3491   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3492   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3493   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3494   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3495   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3496   PetscCall(MatDestroy_SeqAIJ(A));
3497   PetscFunctionReturn(PETSC_SUCCESS);
3498 }
3499 
3500 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3501 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3502 static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3503 {
3504   PetscFunctionBegin;
3505   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3506   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3507   PetscFunctionReturn(PETSC_SUCCESS);
3508 }
3509 
3510 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3511 {
3512   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3513   Mat_SeqAIJCUSPARSE *cy;
3514   Mat_SeqAIJCUSPARSE *cx;
3515   PetscScalar        *ay;
3516   const PetscScalar  *ax;
3517   CsrMatrix          *csry, *csrx;
3518 
3519   PetscFunctionBegin;
3520   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3521   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3522   if (X->ops->axpy != Y->ops->axpy) {
3523     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3524     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3525     PetscFunctionReturn(PETSC_SUCCESS);
3526   }
3527   /* if we are here, it means both matrices are bound to GPU */
3528   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3529   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3530   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3531   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3532   csry = (CsrMatrix *)cy->mat->mat;
3533   csrx = (CsrMatrix *)cx->mat->mat;
3534   /* see if we can turn this into a cublas axpy */
3535   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3536     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3537     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3538     if (eq) str = SAME_NONZERO_PATTERN;
3539   }
3540   /* spgeam is buggy with one column */
3541   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3542 
3543   if (str == SUBSET_NONZERO_PATTERN) {
3544     PetscScalar b = 1.0;
3545 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3546     size_t bufferSize;
3547     void  *buffer;
3548 #endif
3549 
3550     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3551     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3552     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3553 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3554     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3555                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3556     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3557     PetscCall(PetscLogGpuTimeBegin());
3558     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3559                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3560     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3561     PetscCall(PetscLogGpuTimeEnd());
3562     PetscCallCUDA(cudaFree(buffer));
3563 #else
3564     PetscCall(PetscLogGpuTimeBegin());
3565     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3566                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3567     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3568     PetscCall(PetscLogGpuTimeEnd());
3569 #endif
3570     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3571     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3572     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3573     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3574   } else if (str == SAME_NONZERO_PATTERN) {
3575     cublasHandle_t cublasv2handle;
3576     PetscBLASInt   one = 1, bnz = 1;
3577 
3578     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3579     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3580     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3581     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3582     PetscCall(PetscLogGpuTimeBegin());
3583     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3584     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3585     PetscCall(PetscLogGpuTimeEnd());
3586     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3587     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3588     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3589   } else {
3590     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3591     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3592   }
3593   PetscFunctionReturn(PETSC_SUCCESS);
3594 }
3595 
3596 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3597 {
3598   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3599   PetscScalar   *ay;
3600   cublasHandle_t cublasv2handle;
3601   PetscBLASInt   one = 1, bnz = 1;
3602 
3603   PetscFunctionBegin;
3604   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3605   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3606   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3607   PetscCall(PetscLogGpuTimeBegin());
3608   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3609   PetscCall(PetscLogGpuFlops(bnz));
3610   PetscCall(PetscLogGpuTimeEnd());
3611   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3612   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3613   PetscFunctionReturn(PETSC_SUCCESS);
3614 }
3615 
3616 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3617 {
3618   PetscBool   both = PETSC_FALSE;
3619   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
3620 
3621   PetscFunctionBegin;
3622   if (A->factortype == MAT_FACTOR_NONE) {
3623     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3624     if (spptr->mat) {
3625       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3626       if (matrix->values) {
3627         both = PETSC_TRUE;
3628         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3629       }
3630     }
3631     if (spptr->matTranspose) {
3632       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3633       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3634     }
3635   }
3636   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3637   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3638   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3639   else A->offloadmask = PETSC_OFFLOAD_CPU;
3640   PetscFunctionReturn(PETSC_SUCCESS);
3641 }
3642 
3643 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3644 {
3645   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3646 
3647   PetscFunctionBegin;
3648   if (A->factortype != MAT_FACTOR_NONE) {
3649     A->boundtocpu = flg;
3650     PetscFunctionReturn(PETSC_SUCCESS);
3651   }
3652   if (flg) {
3653     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3654 
3655     A->ops->scale                     = MatScale_SeqAIJ;
3656     A->ops->axpy                      = MatAXPY_SeqAIJ;
3657     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3658     A->ops->mult                      = MatMult_SeqAIJ;
3659     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3660     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3661     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3662     A->ops->multhermitiantranspose    = NULL;
3663     A->ops->multhermitiantransposeadd = NULL;
3664     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3665     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3666     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3667     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3668     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3669     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3670     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3671     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3672   } else {
3673     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3674     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3675     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3676     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3677     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3678     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3679     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3680     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3681     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3682     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3683     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3684     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3685     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3686     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3687     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3688     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3689     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3690 
3691     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
3692     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3693     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3694     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
3695     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
3696     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3697   }
3698   A->boundtocpu = flg;
3699   if (flg && a->inode.size) {
3700     a->inode.use = PETSC_TRUE;
3701   } else {
3702     a->inode.use = PETSC_FALSE;
3703   }
3704   PetscFunctionReturn(PETSC_SUCCESS);
3705 }
3706 
3707 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3708 {
3709   Mat B;
3710 
3711   PetscFunctionBegin;
3712   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
3713   if (reuse == MAT_INITIAL_MATRIX) {
3714     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3715   } else if (reuse == MAT_REUSE_MATRIX) {
3716     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3717   }
3718   B = *newmat;
3719 
3720   PetscCall(PetscFree(B->defaultvectype));
3721   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
3722 
3723   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3724     if (B->factortype == MAT_FACTOR_NONE) {
3725       Mat_SeqAIJCUSPARSE *spptr;
3726       PetscCall(PetscNew(&spptr));
3727       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3728       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3729       spptr->format = MAT_CUSPARSE_CSR;
3730 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3731   #if CUSPARSE_VERSION > 11301
3732       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3733   #else
3734       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3735   #endif
3736       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3737       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3738 #endif
3739       B->spptr = spptr;
3740     } else {
3741       Mat_SeqAIJCUSPARSETriFactors *spptr;
3742 
3743       PetscCall(PetscNew(&spptr));
3744       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3745       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3746       B->spptr = spptr;
3747     }
3748     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3749   }
3750   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3751   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3752   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3753   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3754   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3755   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
3756 
3757   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
3758   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
3759   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
3760 #if defined(PETSC_HAVE_HYPRE)
3761   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
3762 #endif
3763   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
3764   PetscFunctionReturn(PETSC_SUCCESS);
3765 }
3766 
3767 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3768 {
3769   PetscFunctionBegin;
3770   PetscCall(MatCreate_SeqAIJ(B));
3771   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
3772   PetscFunctionReturn(PETSC_SUCCESS);
3773 }
3774 
3775 /*MC
3776    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3777 
3778    A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either
3779    CSR, ELL, or Hybrid format.
3780    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3781 
3782    Options Database Keys:
3783 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3784 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
3785 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
3786 +  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3787 
3788   Level: beginner
3789 
3790 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3791 M*/
3792 
3793 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3794 
3795 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3796 {
3797   PetscFunctionBegin;
3798   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
3799   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
3800   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
3801   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
3802   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
3803 
3804   PetscFunctionReturn(PETSC_SUCCESS);
3805 }
3806 
3807 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3808 {
3809   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;
3810 
3811   PetscFunctionBegin;
3812   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3813   delete cusp->cooPerm;
3814   delete cusp->cooPerm_a;
3815   cusp->cooPerm   = NULL;
3816   cusp->cooPerm_a = NULL;
3817   if (cusp->use_extended_coo) {
3818     PetscCallCUDA(cudaFree(cusp->jmap_d));
3819     PetscCallCUDA(cudaFree(cusp->perm_d));
3820   }
3821   cusp->use_extended_coo = PETSC_FALSE;
3822   PetscFunctionReturn(PETSC_SUCCESS);
3823 }
3824 
3825 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3826 {
3827   PetscFunctionBegin;
3828   if (*cusparsestruct) {
3829     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
3830     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
3831     delete (*cusparsestruct)->workVector;
3832     delete (*cusparsestruct)->rowoffsets_gpu;
3833     delete (*cusparsestruct)->cooPerm;
3834     delete (*cusparsestruct)->cooPerm_a;
3835     delete (*cusparsestruct)->csr2csc_i;
3836     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
3837     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
3838     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
3839     PetscCall(PetscFree(*cusparsestruct));
3840   }
3841   PetscFunctionReturn(PETSC_SUCCESS);
3842 }
3843 
3844 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3845 {
3846   PetscFunctionBegin;
3847   if (*mat) {
3848     delete (*mat)->values;
3849     delete (*mat)->column_indices;
3850     delete (*mat)->row_offsets;
3851     delete *mat;
3852     *mat = 0;
3853   }
3854   PetscFunctionReturn(PETSC_SUCCESS);
3855 }
3856 
3857 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3858 {
3859   PetscFunctionBegin;
3860   if (*trifactor) {
3861     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
3862     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3863     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3864     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
3865     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
3866 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3867     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
3868 #endif
3869     PetscCall(PetscFree(*trifactor));
3870   }
3871   PetscFunctionReturn(PETSC_SUCCESS);
3872 }
3873 
3874 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3875 {
3876   CsrMatrix *mat;
3877 
3878   PetscFunctionBegin;
3879   if (*matstruct) {
3880     if ((*matstruct)->mat) {
3881       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3882 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3883         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3884 #else
3885         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3886         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
3887 #endif
3888       } else {
3889         mat = (CsrMatrix *)(*matstruct)->mat;
3890         PetscCall(CsrMatrix_Destroy(&mat));
3891       }
3892     }
3893     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
3894     delete (*matstruct)->cprowIndices;
3895     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
3896     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
3897     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
3898 
3899 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3900     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3901     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
3902     for (int i = 0; i < 3; i++) {
3903       if (mdata->cuSpMV[i].initialized) {
3904         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
3905         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
3906         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3907       }
3908     }
3909 #endif
3910     delete *matstruct;
3911     *matstruct = NULL;
3912   }
3913   PetscFunctionReturn(PETSC_SUCCESS);
3914 }
3915 
3916 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3917 {
3918   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
3919 
3920   PetscFunctionBegin;
3921   if (fs) {
3922     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3923     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3924     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3925     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3926     delete fs->rpermIndices;
3927     delete fs->cpermIndices;
3928     delete fs->workVector;
3929     fs->rpermIndices = NULL;
3930     fs->cpermIndices = NULL;
3931     fs->workVector   = NULL;
3932     if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
3933     if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
3934     fs->init_dev_prop = PETSC_FALSE;
3935 #if CUSPARSE_VERSION >= 11500
3936     PetscCallCUDA(cudaFree(fs->csrRowPtr));
3937     PetscCallCUDA(cudaFree(fs->csrColIdx));
3938     PetscCallCUDA(cudaFree(fs->csrVal));
3939     PetscCallCUDA(cudaFree(fs->X));
3940     PetscCallCUDA(cudaFree(fs->Y));
3941     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3942     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3943     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
3944     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3945     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3946     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3947     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3948     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3949     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3950     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3951     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3952     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3953     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3954     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3955     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3956     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
3957 
3958     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3959     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3960 #endif
3961   }
3962   PetscFunctionReturn(PETSC_SUCCESS);
3963 }
3964 
3965 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3966 {
3967   PetscFunctionBegin;
3968   if (*trifactors) {
3969     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3970     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
3971     PetscCall(PetscFree(*trifactors));
3972   }
3973   PetscFunctionReturn(PETSC_SUCCESS);
3974 }
3975 
3976 struct IJCompare {
3977   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3978   {
3979     if (t1.get<0>() < t2.get<0>()) return true;
3980     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3981     return false;
3982   }
3983 };
3984 
3985 struct IJEqual {
3986   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3987   {
3988     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3989     return true;
3990   }
3991 };
3992 
3993 struct IJDiff {
3994   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3995 };
3996 
3997 struct IJSum {
3998   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3999 };
4000 
4001 #include <thrust/iterator/discard_iterator.h>
4002 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4003 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4004 {
4005   Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
4006   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
4007   THRUSTARRAY                          *cooPerm_v = NULL;
4008   thrust::device_ptr<const PetscScalar> d_v;
4009   CsrMatrix                            *matrix;
4010   PetscInt                              n;
4011 
4012   PetscFunctionBegin;
4013   PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
4014   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
4015   if (!cusp->cooPerm) {
4016     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
4017     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
4018     PetscFunctionReturn(PETSC_SUCCESS);
4019   }
4020   matrix = (CsrMatrix *)cusp->mat->mat;
4021   PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4022   if (!v) {
4023     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4024     goto finalize;
4025   }
4026   n = cusp->cooPerm->size();
4027   if (isCudaMem(v)) {
4028     d_v = thrust::device_pointer_cast(v);
4029   } else {
4030     cooPerm_v = new THRUSTARRAY(n);
4031     cooPerm_v->assign(v, v + n);
4032     d_v = cooPerm_v->data();
4033     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4034   }
4035   PetscCall(PetscLogGpuTimeBegin());
4036   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4037     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add these them */
4038       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4039       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4040       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4041         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
4042         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4043       */
4044       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4045       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
4046       delete cooPerm_w;
4047     } else {
4048       /* all nonzeros in d_v[] are unique entries */
4049       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4050       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4051       thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4052     }
4053   } else {
4054     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4055       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4056       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4057     } else {
4058       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4059       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4060       thrust::for_each(zibit, zieit, VecCUDAEquals());
4061     }
4062   }
4063   PetscCall(PetscLogGpuTimeEnd());
4064 finalize:
4065   delete cooPerm_v;
4066   A->offloadmask = PETSC_OFFLOAD_GPU;
4067   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4068   /* shorter version of MatAssemblyEnd_SeqAIJ */
4069   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
4070   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
4071   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
4072   a->reallocs = 0;
4073   A->info.mallocs += 0;
4074   A->info.nz_unneeded = 0;
4075   A->assembled = A->was_assembled = PETSC_TRUE;
4076   A->num_ass++;
4077   PetscFunctionReturn(PETSC_SUCCESS);
4078 }
4079 
4080 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4081 {
4082   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4083 
4084   PetscFunctionBegin;
4085   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4086   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4087   if (destroy) {
4088     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4089     delete cusp->csr2csc_i;
4090     cusp->csr2csc_i = NULL;
4091   }
4092   A->transupdated = PETSC_FALSE;
4093   PetscFunctionReturn(PETSC_SUCCESS);
4094 }
4095 
4096 #include <thrust/binary_search.h>
4097 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4098 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4099 {
4100   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4101   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
4102   PetscInt            cooPerm_n, nzr = 0;
4103 
4104   PetscFunctionBegin;
4105   PetscCall(PetscLayoutSetUp(A->rmap));
4106   PetscCall(PetscLayoutSetUp(A->cmap));
4107   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4108   if (n != cooPerm_n) {
4109     delete cusp->cooPerm;
4110     delete cusp->cooPerm_a;
4111     cusp->cooPerm   = NULL;
4112     cusp->cooPerm_a = NULL;
4113   }
4114   if (n) {
4115     thrust::device_ptr<PetscInt> d_i, d_j;
4116     PetscInt                    *d_raw_i, *d_raw_j;
4117     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4118     PetscMemType                 imtype, jmtype;
4119 
4120     PetscCall(PetscGetMemType(coo_i, &imtype));
4121     if (PetscMemTypeHost(imtype)) {
4122       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4123       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4124       d_i        = thrust::device_pointer_cast(d_raw_i);
4125       free_raw_i = PETSC_TRUE;
4126       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4127     } else {
4128       d_i = thrust::device_pointer_cast(coo_i);
4129     }
4130 
4131     PetscCall(PetscGetMemType(coo_j, &jmtype));
4132     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4133       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4134       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4135       d_j        = thrust::device_pointer_cast(d_raw_j);
4136       free_raw_j = PETSC_TRUE;
4137       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4138     } else {
4139       d_j = thrust::device_pointer_cast(coo_j);
4140     }
4141 
4142     THRUSTINTARRAY ii(A->rmap->n);
4143 
4144     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4145     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
4146 
4147     /* Ex.
4148       n = 6
4149       coo_i = [3,3,1,4,1,4]
4150       coo_j = [3,2,2,5,2,6]
4151     */
4152     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4153     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
4154 
4155     PetscCall(PetscLogGpuTimeBegin());
4156     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4157     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4158     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4159     THRUSTINTARRAY w(d_j, d_j + n);
4160 
4161     /*
4162       d_i     = [1,1,3,3,4,4]
4163       d_j     = [2,2,2,3,5,6]
4164       cooPerm = [2,4,1,0,3,5]
4165     */
4166     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4167 
4168     /*
4169       d_i     = [1,3,3,4,4,x]
4170                             ^ekey
4171       d_j     = [2,2,3,5,6,x]
4172                            ^nekye
4173     */
4174     if (nekey == ekey) { /* all entries are unique */
4175       delete cusp->cooPerm_a;
4176       cusp->cooPerm_a = NULL;
4177     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4178       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4179       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4180       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4181       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4182       w[0]                  = 0;
4183       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4184       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4185     }
4186     thrust::counting_iterator<PetscInt> search_begin(0);
4187     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4188                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4189                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4190     PetscCall(PetscLogGpuTimeEnd());
4191 
4192     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
4193     a->singlemalloc = PETSC_FALSE;
4194     a->free_a       = PETSC_TRUE;
4195     a->free_ij      = PETSC_TRUE;
4196     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4197     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4198     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4199     a->nz = a->maxnz = a->i[A->rmap->n];
4200     a->rmax          = 0;
4201     PetscCall(PetscMalloc1(a->nz, &a->a));
4202     PetscCall(PetscMalloc1(a->nz, &a->j));
4203     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4204     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
4205     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
4206     for (PetscInt i = 0; i < A->rmap->n; i++) {
4207       const PetscInt nnzr = a->i[i + 1] - a->i[i];
4208       nzr += (PetscInt) !!(nnzr);
4209       a->ilen[i] = a->imax[i] = nnzr;
4210       a->rmax                 = PetscMax(a->rmax, nnzr);
4211     }
4212     a->nonzerorowcnt = nzr;
4213     A->preallocated  = PETSC_TRUE;
4214     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
4215     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4216     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4217     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
4218   } else {
4219     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
4220   }
4221   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
4222 
4223   /* We want to allocate the CUSPARSE struct for matvec now.
4224      The code is so convoluted now that I prefer to copy zeros */
4225   PetscCall(PetscArrayzero(a->a, a->nz));
4226   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
4227   A->offloadmask = PETSC_OFFLOAD_CPU;
4228   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4229   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
4230   PetscFunctionReturn(PETSC_SUCCESS);
4231 }
4232 
4233 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4234 {
4235   Mat_SeqAIJ         *seq;
4236   Mat_SeqAIJCUSPARSE *dev;
4237   PetscBool           coo_basic = PETSC_TRUE;
4238   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
4239 
4240   PetscFunctionBegin;
4241   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
4242   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4243   if (coo_i) {
4244     PetscCall(PetscGetMemType(coo_i, &mtype));
4245     if (PetscMemTypeHost(mtype)) {
4246       for (PetscCount k = 0; k < coo_n; k++) {
4247         if (coo_i[k] < 0 || coo_j[k] < 0) {
4248           coo_basic = PETSC_FALSE;
4249           break;
4250         }
4251       }
4252     }
4253   }
4254 
4255   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4256     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
4257   } else {
4258     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
4259     mat->offloadmask = PETSC_OFFLOAD_CPU;
4260     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4261     seq = static_cast<Mat_SeqAIJ *>(mat->data);
4262     dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4263     PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
4264     PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4265     PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
4266     PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4267     dev->use_extended_coo = PETSC_TRUE;
4268   }
4269   PetscFunctionReturn(PETSC_SUCCESS);
4270 }
4271 
4272 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4273 {
4274   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4275   const PetscCount grid_size = gridDim.x * blockDim.x;
4276   for (; i < nnz; i += grid_size) {
4277     PetscScalar sum = 0.0;
4278     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4279     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4280   }
4281 }
4282 
4283 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4284 {
4285   Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
4286   Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4287   PetscCount          Annz = seq->nz;
4288   PetscMemType        memtype;
4289   const PetscScalar  *v1 = v;
4290   PetscScalar        *Aa;
4291 
4292   PetscFunctionBegin;
4293   if (dev->use_extended_coo) {
4294     PetscCall(PetscGetMemType(v, &memtype));
4295     if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
4296       PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
4297       PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4298     }
4299 
4300     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4301     else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4302 
4303     if (Annz) {
4304       MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
4305       PetscCallCUDA(cudaPeekAtLastError());
4306     }
4307 
4308     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4309     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4310 
4311     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4312   } else {
4313     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
4314   }
4315   PetscFunctionReturn(PETSC_SUCCESS);
4316 }
4317 
4318 /*@C
4319     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
4320 
4321    Not collective
4322 
4323     Input Parameters:
4324 +   A - the matrix
4325 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4326 
4327     Output Parameters:
4328 +   ia - the CSR row pointers
4329 -   ja - the CSR column indices
4330 
4331     Level: developer
4332 
4333     Note:
4334       When compressed is true, the CSR structure does not contain empty rows
4335 
4336 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4337 @*/
4338 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4339 {
4340   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4341   CsrMatrix          *csr;
4342   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;
4343 
4344   PetscFunctionBegin;
4345   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4346   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4347   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4348   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4349   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4350   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4351   csr = (CsrMatrix *)cusp->mat->mat;
4352   if (i) {
4353     if (!compressed && a->compressedrow.use) { /* need full row offset */
4354       if (!cusp->rowoffsets_gpu) {
4355         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4356         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4357         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4358       }
4359       *i = cusp->rowoffsets_gpu->data().get();
4360     } else *i = csr->row_offsets->data().get();
4361   }
4362   if (j) *j = csr->column_indices->data().get();
4363   PetscFunctionReturn(PETSC_SUCCESS);
4364 }
4365 
4366 /*@C
4367     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4368 
4369    Not collective
4370 
4371     Input Parameters:
4372 +   A - the matrix
4373 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4374 
4375     Output Parameters:
4376 +   ia - the CSR row pointers
4377 -   ja - the CSR column indices
4378 
4379     Level: developer
4380 
4381 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4382 @*/
4383 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
4384 {
4385   PetscFunctionBegin;
4386   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4387   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4388   if (i) *i = NULL;
4389   if (j) *j = NULL;
4390   PetscFunctionReturn(PETSC_SUCCESS);
4391 }
4392 
4393 /*@C
4394    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4395 
4396    Not Collective
4397 
4398    Input Parameter:
4399 .   A - a `MATSEQAIJCUSPARSE` matrix
4400 
4401    Output Parameter:
4402 .   a - pointer to the device data
4403 
4404    Level: developer
4405 
4406    Note:
4407    May trigger host-device copies if up-to-date matrix data is on host
4408 
4409 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4410 @*/
4411 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4412 {
4413   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4414   CsrMatrix          *csr;
4415 
4416   PetscFunctionBegin;
4417   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4418   PetscValidPointer(a, 2);
4419   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4420   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4421   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4422   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4423   csr = (CsrMatrix *)cusp->mat->mat;
4424   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4425   *a = csr->values->data().get();
4426   PetscFunctionReturn(PETSC_SUCCESS);
4427 }
4428 
4429 /*@C
4430    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4431 
4432    Not Collective
4433 
4434    Input Parameter:
4435 .   A - a `MATSEQAIJCUSPARSE` matrix
4436 
4437    Output Parameter:
4438 .   a - pointer to the device data
4439 
4440    Level: developer
4441 
4442 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4443 @*/
4444 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4445 {
4446   PetscFunctionBegin;
4447   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4448   PetscValidPointer(a, 2);
4449   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4450   *a = NULL;
4451   PetscFunctionReturn(PETSC_SUCCESS);
4452 }
4453 
4454 /*@C
4455    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4456 
4457    Not Collective
4458 
4459    Input Parameter:
4460 .   A - a `MATSEQAIJCUSPARSE` matrix
4461 
4462    Output Parameter:
4463 .   a - pointer to the device data
4464 
4465    Level: developer
4466 
4467    Note:
4468    May trigger host-device copies if up-to-date matrix data is on host
4469 
4470 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4471 @*/
4472 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4473 {
4474   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4475   CsrMatrix          *csr;
4476 
4477   PetscFunctionBegin;
4478   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4479   PetscValidPointer(a, 2);
4480   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4481   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4482   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4483   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4484   csr = (CsrMatrix *)cusp->mat->mat;
4485   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4486   *a             = csr->values->data().get();
4487   A->offloadmask = PETSC_OFFLOAD_GPU;
4488   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4489   PetscFunctionReturn(PETSC_SUCCESS);
4490 }
4491 /*@C
4492    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4493 
4494    Not Collective
4495 
4496    Input Parameter:
4497 .   A - a `MATSEQAIJCUSPARSE` matrix
4498 
4499    Output Parameter:
4500 .   a - pointer to the device data
4501 
4502    Level: developer
4503 
4504 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4505 @*/
4506 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4507 {
4508   PetscFunctionBegin;
4509   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4510   PetscValidPointer(a, 2);
4511   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4512   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4513   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4514   *a = NULL;
4515   PetscFunctionReturn(PETSC_SUCCESS);
4516 }
4517 
4518 /*@C
4519    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4520 
4521    Not Collective
4522 
4523    Input Parameter:
4524 .   A - a `MATSEQAIJCUSPARSE` matrix
4525 
4526    Output Parameter:
4527 .   a - pointer to the device data
4528 
4529    Level: developer
4530 
4531    Note:
4532    Does not trigger host-device copies and flags data validity on the GPU
4533 
4534 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4535 @*/
4536 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4537 {
4538   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4539   CsrMatrix          *csr;
4540 
4541   PetscFunctionBegin;
4542   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4543   PetscValidPointer(a, 2);
4544   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4545   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4546   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4547   csr = (CsrMatrix *)cusp->mat->mat;
4548   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4549   *a             = csr->values->data().get();
4550   A->offloadmask = PETSC_OFFLOAD_GPU;
4551   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4552   PetscFunctionReturn(PETSC_SUCCESS);
4553 }
4554 
4555 /*@C
4556    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4557 
4558    Not Collective
4559 
4560    Input Parameter:
4561 .   A - a `MATSEQAIJCUSPARSE` matrix
4562 
4563    Output Parameter:
4564 .   a - pointer to the device data
4565 
4566    Level: developer
4567 
4568 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4569 @*/
4570 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4571 {
4572   PetscFunctionBegin;
4573   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4574   PetscValidPointer(a, 2);
4575   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4576   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4577   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4578   *a = NULL;
4579   PetscFunctionReturn(PETSC_SUCCESS);
4580 }
4581 
4582 struct IJCompare4 {
4583   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4584   {
4585     if (t1.get<0>() < t2.get<0>()) return true;
4586     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4587     return false;
4588   }
4589 };
4590 
4591 struct Shift {
4592   int _shift;
4593 
4594   Shift(int shift) : _shift(shift) { }
4595   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4596 };
4597 
4598 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4599 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4600 {
4601   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4602   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4603   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4604   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4605   PetscInt                      Annz, Bnnz;
4606   cusparseStatus_t              stat;
4607   PetscInt                      i, m, n, zero = 0;
4608 
4609   PetscFunctionBegin;
4610   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4611   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4612   PetscValidPointer(C, 4);
4613   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4614   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4615   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4616   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4617   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4618   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4619   if (reuse == MAT_INITIAL_MATRIX) {
4620     m = A->rmap->n;
4621     n = A->cmap->n + B->cmap->n;
4622     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4623     PetscCall(MatSetSizes(*C, m, n, m, n));
4624     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4625     c                       = (Mat_SeqAIJ *)(*C)->data;
4626     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4627     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4628     Ccsr                    = new CsrMatrix;
4629     Cmat->cprowIndices      = NULL;
4630     c->compressedrow.use    = PETSC_FALSE;
4631     c->compressedrow.nrows  = 0;
4632     c->compressedrow.i      = NULL;
4633     c->compressedrow.rindex = NULL;
4634     Ccusp->workVector       = NULL;
4635     Ccusp->nrows            = m;
4636     Ccusp->mat              = Cmat;
4637     Ccusp->mat->mat         = Ccsr;
4638     Ccsr->num_rows          = m;
4639     Ccsr->num_cols          = n;
4640     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4641     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4642     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4643     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4644     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4645     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4646     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4647     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4648     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4649     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4650     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4651     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4652     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4653 
4654     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4655     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4656     Annz                 = (PetscInt)Acsr->column_indices->size();
4657     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4658     c->nz                = Annz + Bnnz;
4659     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4660     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4661     Ccsr->values         = new THRUSTARRAY(c->nz);
4662     Ccsr->num_entries    = c->nz;
4663     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4664     if (c->nz) {
4665       auto              Acoo = new THRUSTINTARRAY32(Annz);
4666       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4667       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4668       THRUSTINTARRAY32 *Aroff, *Broff;
4669 
4670       if (a->compressedrow.use) { /* need full row offset */
4671         if (!Acusp->rowoffsets_gpu) {
4672           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4673           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4674           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4675         }
4676         Aroff = Acusp->rowoffsets_gpu;
4677       } else Aroff = Acsr->row_offsets;
4678       if (b->compressedrow.use) { /* need full row offset */
4679         if (!Bcusp->rowoffsets_gpu) {
4680           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4681           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4682           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4683         }
4684         Broff = Bcusp->rowoffsets_gpu;
4685       } else Broff = Bcsr->row_offsets;
4686       PetscCall(PetscLogGpuTimeBegin());
4687       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4688       PetscCallCUSPARSE(stat);
4689       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4690       PetscCallCUSPARSE(stat);
4691       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4692       auto Aperm = thrust::make_constant_iterator(1);
4693       auto Bperm = thrust::make_constant_iterator(0);
4694 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4695       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4696       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4697 #else
4698       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4699       auto Bcib = Bcsr->column_indices->begin();
4700       auto Bcie = Bcsr->column_indices->end();
4701       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4702 #endif
4703       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4704       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4705       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4706       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4707       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4708       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4709       auto p1    = Ccusp->cooPerm->begin();
4710       auto p2    = Ccusp->cooPerm->begin();
4711       thrust::advance(p2, Annz);
4712       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4713 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4714       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4715 #endif
4716       auto cci = thrust::make_counting_iterator(zero);
4717       auto cce = thrust::make_counting_iterator(c->nz);
4718 #if 0 //Errors on SUMMIT cuda 11.1.0
4719       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4720 #else
4721       auto pred = thrust::identity<int>();
4722       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4723       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4724 #endif
4725       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4726       PetscCallCUSPARSE(stat);
4727       PetscCall(PetscLogGpuTimeEnd());
4728       delete wPerm;
4729       delete Acoo;
4730       delete Bcoo;
4731       delete Ccoo;
4732 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4733       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4734       PetscCallCUSPARSE(stat);
4735 #endif
4736       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4737         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4738         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4739         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4740         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4741         CsrMatrix                    *CcsrT = new CsrMatrix;
4742         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4743         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4744 
4745         (*C)->form_explicit_transpose = PETSC_TRUE;
4746         (*C)->transupdated            = PETSC_TRUE;
4747         Ccusp->rowoffsets_gpu         = NULL;
4748         CmatT->cprowIndices           = NULL;
4749         CmatT->mat                    = CcsrT;
4750         CcsrT->num_rows               = n;
4751         CcsrT->num_cols               = m;
4752         CcsrT->num_entries            = c->nz;
4753 
4754         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4755         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4756         CcsrT->values         = new THRUSTARRAY(c->nz);
4757 
4758         PetscCall(PetscLogGpuTimeBegin());
4759         auto rT = CcsrT->row_offsets->begin();
4760         if (AT) {
4761           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4762           thrust::advance(rT, -1);
4763         }
4764         if (BT) {
4765           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4766           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4767           thrust::copy(titb, tite, rT);
4768         }
4769         auto cT = CcsrT->column_indices->begin();
4770         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4771         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4772         auto vT = CcsrT->values->begin();
4773         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4774         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4775         PetscCall(PetscLogGpuTimeEnd());
4776 
4777         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4778         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4779         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4780         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
4781         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
4782         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
4783         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4784         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4785         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4786 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4787         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4788         PetscCallCUSPARSE(stat);
4789 #endif
4790         Ccusp->matTranspose = CmatT;
4791       }
4792     }
4793 
4794     c->singlemalloc = PETSC_FALSE;
4795     c->free_a       = PETSC_TRUE;
4796     c->free_ij      = PETSC_TRUE;
4797     PetscCall(PetscMalloc1(m + 1, &c->i));
4798     PetscCall(PetscMalloc1(c->nz, &c->j));
4799     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4800       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4801       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4802       ii = *Ccsr->row_offsets;
4803       jj = *Ccsr->column_indices;
4804       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4805       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4806     } else {
4807       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4808       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4809     }
4810     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4811     PetscCall(PetscMalloc1(m, &c->ilen));
4812     PetscCall(PetscMalloc1(m, &c->imax));
4813     c->maxnz         = c->nz;
4814     c->nonzerorowcnt = 0;
4815     c->rmax          = 0;
4816     for (i = 0; i < m; i++) {
4817       const PetscInt nn = c->i[i + 1] - c->i[i];
4818       c->ilen[i] = c->imax[i] = nn;
4819       c->nonzerorowcnt += (PetscInt) !!nn;
4820       c->rmax = PetscMax(c->rmax, nn);
4821     }
4822     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4823     PetscCall(PetscMalloc1(c->nz, &c->a));
4824     (*C)->nonzerostate++;
4825     PetscCall(PetscLayoutSetUp((*C)->rmap));
4826     PetscCall(PetscLayoutSetUp((*C)->cmap));
4827     Ccusp->nonzerostate = (*C)->nonzerostate;
4828     (*C)->preallocated  = PETSC_TRUE;
4829   } else {
4830     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4831     c = (Mat_SeqAIJ *)(*C)->data;
4832     if (c->nz) {
4833       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4834       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4835       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4836       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4837       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4838       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4839       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4840       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4841       Acsr = (CsrMatrix *)Acusp->mat->mat;
4842       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4843       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4844       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4845       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4846       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4847       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4848       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4849       auto pmid = Ccusp->cooPerm->begin();
4850       thrust::advance(pmid, Acsr->num_entries);
4851       PetscCall(PetscLogGpuTimeBegin());
4852       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
4853       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4854       thrust::for_each(zibait, zieait, VecCUDAEquals());
4855       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4856       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4857       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4858       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4859       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4860         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4861         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4862         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4863         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4864         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4865         auto       vT    = CcsrT->values->begin();
4866         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4867         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4868         (*C)->transupdated = PETSC_TRUE;
4869       }
4870       PetscCall(PetscLogGpuTimeEnd());
4871     }
4872   }
4873   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4874   (*C)->assembled     = PETSC_TRUE;
4875   (*C)->was_assembled = PETSC_FALSE;
4876   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4877   PetscFunctionReturn(PETSC_SUCCESS);
4878 }
4879 
4880 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4881 {
4882   bool               dmem;
4883   const PetscScalar *av;
4884 
4885   PetscFunctionBegin;
4886   dmem = isCudaMem(v);
4887   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4888   if (n && idx) {
4889     THRUSTINTARRAY widx(n);
4890     widx.assign(idx, idx + n);
4891     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4892 
4893     THRUSTARRAY                    *w = NULL;
4894     thrust::device_ptr<PetscScalar> dv;
4895     if (dmem) {
4896       dv = thrust::device_pointer_cast(v);
4897     } else {
4898       w  = new THRUSTARRAY(n);
4899       dv = w->data();
4900     }
4901     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4902 
4903     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4904     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4905     thrust::for_each(zibit, zieit, VecCUDAEquals());
4906     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4907     delete w;
4908   } else {
4909     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4910   }
4911   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4912   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4913   PetscFunctionReturn(PETSC_SUCCESS);
4914 }
4915