xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d5b43468fb8780a8feea140ccd6fa3e6a50411cc)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #if PETSC_CPP_VERSION >= 14
17   #define PETSC_HAVE_THRUST_ASYNC 1
18   // thrust::for_each(thrust::cuda::par.on()) requires C++14
19   #include <thrust/async/for_each.h>
20 #endif
21 #include <thrust/iterator/constant_iterator.h>
22 #include <thrust/remove.h>
23 #include <thrust/sort.h>
24 #include <thrust/unique.h>
25 
/* String table for MatCUSPARSEStorageFormat; the trailing entries (enum type name, option
   prefix, 0 terminator) follow the table layout used with PetscOptionsEnum() — see the
   comment below about parsing user command line options. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
/* Entries are ordered by the 0-based integer value of the corresponding cuSPARSE enum
   (note CSR_ALG1 = 4 precedes COO_ALG4 = 5 in the SpMM table); the MatSetFromOptions
   code below cross-checks this ordering against the cuSPARSE headers. */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
62 
63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 
67 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
68 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
69 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
70 
71 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
72 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
73 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
74 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
75 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
76 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
77 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
78 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
85 
86 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
88 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
91 
92 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
94 
95 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98 
/* Implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE: records the requested
   GPU storage format in the matrix's cuSPARSE context. For sequential matrices
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL store into the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the single seq format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
116 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
        `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if one is registered; a no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
142 
/* Implementation of MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE: stores the flag
   that selects the built-in CPU MatSolve() instead of the GPU triangular solve. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  cusp                = (Mat_SeqAIJCUSPARSE *)A->spptr;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
151 
152 /*@
153    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
154 
155    Input Parameters:
156 +  A - Matrix of type `MATSEQAIJCUSPARSE`
157 -  use_cpu - set flag for using the built-in CPU `MatSolve()`
158 
159    Output Parameter:
160 
161    Note:
162    The cuSparse LU solver currently computes the factors with the built-in CPU method
163    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
164    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
165 
166    Level: intermediate
167 
168 .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
169 @*/
170 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
171 {
172   PetscFunctionBegin;
173   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
174   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
175   PetscFunctionReturn(0);
176 }
177 
/* MatSetOption() implementation for MATSEQAIJCUSPARSE. Handles the one option with
   GPU-side consequences here and forwards everything else to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* if an explicit transpose exists and the option is being turned off, destroy it now
       to prevent logic errors should the flag be set to true again later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
193 
194 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
195 
/* Numeric LU factorization for MATSEQAIJCUSPARSE. The factorization itself is done by
   the CPU SeqAIJ kernel on host data; afterwards the appropriate MatSolve variants are
   installed and, unless a CPU solve was requested, the triangular factors are copied to
   the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering kernels
     skip the row/column permutation step */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!cusp->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    /* stage the triangular factors on the GPU */
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  PetscFunctionReturn(0);
}
227 
/* Process runtime options for MATSEQAIJCUSPARSE: storage format for SpMV/TriSolve,
   CPU-vs-GPU solve, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Only unfactored matrices consult these options. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same consistency check for the SpMM algorithm table */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    /* same consistency check for the csr2csc algorithm table */
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
263 
/* Build (first call) or refresh (subsequent calls) the GPU copy of the unit-diagonal
   lower triangular factor L, extracted from the host CSR data of a factored SeqAIJ
   matrix. On the first call the full CSR structure (row offsets, column indices,
   values) is assembled in pinned host memory, uploaded into thrust arrays, and the
   cuSPARSE csrsv analysis is performed; later calls only re-upload the values.
   The pinned value buffer is kept in loTriFactor->AA_h for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the thrust assign()s below copy asynchronously-fast */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 of L is just the implicit unit diagonal */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign()s below perform the host-to-device uploads */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h keeps the pinned value buffer for later refreshes */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        /* re-upload only the values; structure on the GPU is unchanged */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
394 
/* Build (first call) or refresh (subsequent calls) the GPU copy of the upper triangular
   factor U, extracted from the host CSR data of a factored SeqAIJ matrix. The SeqAIJ
   factored format stores U's rows backwards via a->diag, so the rows are walked from
   the bottom up and the stored reciprocal pivot becomes the explicit diagonal entry.
   First call assembles structure + values in pinned host memory, uploads, and runs the
   cuSPARSE csrsv analysis; later calls re-upload values only (buffer kept in AA_h). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host memory so the thrust assign()s below copy asynchronously-fast */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; v[nz] holds the stored reciprocal pivot */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* the assign()s below perform the host-to-device uploads */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h keeps the pinned value buffer for later refreshes */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        /* re-upload only the values; structure on the GPU is unchanged */
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
525 
/* Upload both triangular factors of an (I)LU-factored SeqAIJ matrix to the GPU and run
   the cuSPARSE solve analysis, allocating the shared work vector and (when the row or
   column ordering is not the identity) the permutation index arrays used by MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector shared by the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular (row) permutation indices, only when the ordering is nontrivial */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !factors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular (column) permutation indices, only when the ordering is nontrivial */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !factors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
568 
/*
   Builds (or refreshes) the GPU triangular-factor structures for an ICC factorization
   whose values live on the CPU in A.  The CPU storage is viewed as SBAIJ-style
   upper-triangular CSR where each row holds its off-diagonal entries first and the
   diagonal entry last (v[nz] below).

   Two cusparse CSR matrices sharing the upper-triangular sparsity pattern are produced:
     - upTriFactor: unit-diagonal upper factor, solved with CUSPARSE_OPERATION_NON_TRANSPOSE
     - loTriFactor: same pattern with values additionally scaled by the inverse diagonal,
       solved with CUSPARSE_OPERATION_TRANSPOSE (so it plays the role of the lower factor)

   On the first call (no factors allocated yet) the sparsity pattern is built, the
   structures are created, and the cusparse solve analysis is performed; on later calls
   only the numerical values are recomputed and copied to the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* host-side (pinned) CSR row offsets / column indices being assembled */
  PetscScalar                       *AAUp;        /* host-side values of the upper factor */
  PetscScalar                       *AALo;        /* host-side values of the transpose-solved lower factor */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* NOTE(review): A->data is viewed as both Mat_SeqAIJ and Mat_SeqSBAIJ; this relies on the i/j/a members lining up — confirm against the factor's actual storage type */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers for the values; filled below, then copied to the GPU via thrust assign() */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* First call: allocate space for the pattern of the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix: in each GPU row the diagonal comes first,
           followed by the off-diagonal entries copied from the CPU row */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers into the CPU row */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: both factors store the inverted diagonal */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* off-diagonals: negated for the upper factor, additionally scaled by 1/diag for the lower */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the upper triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix (32-bit indices as required by the csrsv routines) */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the lower triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the lower factor reuses the UPPER-triangular pattern and is solved TRANSPOSED below */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same pattern as the upper factor, different values */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        /* pattern + values of both factors went to the GPU */
        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Refresh path: the pattern already lives on the GPU; recompute only the values
           (same recurrence as above, without the column-index bookkeeping) */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      /* NOTE(review): only char* throws are caught here (file-wide pattern); exceptions
         derived from std::exception (e.g. thrust errors) would propagate — confirm intent */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
765 
/*
   Ensures the ICC triangular factors of A are present and analyzed on the GPU,
   allocates the work vector used by the triangular solves, and — when the row
   permutation is not the identity — uploads the row/column permutation indices
   so the GPU solve can permute the right-hand side and solution.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = a->row;
  PetscInt                      n       = A->rmap->n;
  PetscBool                     identity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n); /* scratch for the two-stage solve */
  factors->nnz = n + 2 * (a->nz - n);                                 /* both factors share the off-diagonal count */

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload permutation indices only when a non-trivial row ordering is in effect */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              inverse;
    const PetscInt *inv_idx, *idx;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &inverse));
    PetscCall(ISGetIndices(inverse, &inv_idx));
    PetscCall(ISGetIndices(rowperm, &idx));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(idx, idx + n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(inv_idx, inv_idx + n);
    PetscCall(ISRestoreIndices(inverse, &inv_idx));
    PetscCall(ISDestroy(&inverse));
    PetscCall(ISRestoreIndices(rowperm, &idx));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
802 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself is
   computed on the CPU by MatCholeskyFactorNumeric_SeqAIJ, after which the solve
   function pointers are selected (the natural-ordering kernels avoid the
   permutation step) and the factors are analyzed/uploaded to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *b       = (Mat_SeqAIJ *)B->data;
  IS          rowperm = b->row;
  PetscBool   identity;

  PetscFunctionBegin;
  /* the CPU factorization needs A's current values on the host */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  PetscCall(ISIdentity(rowperm, &identity));
  B->ops->solve             = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
831 
/*
   Generates the transposes (CSC form) of the lower and upper triangular factors on
   the GPU and runs the cusparse triangular-solve analysis on them; the results are
   stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr for use by
   the transpose solves.

   Each factor is converted with cusparse csr2csc (a CSR matrix reinterpreted as CSC
   is its transpose): the fill mode is flipped (upper <-> lower) while the matrix
   type, diagonal type, and index base are inherited from the original factor.

   Fix: the calls after WaitForCUDA() following each csr2csc conversion were
   PetscLogEventBegin instead of PetscLogEventEnd, so MAT_CUSPARSEGenerateTranspose
   was begun twice and never ended, corrupting the -log_view event nesting.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor: inherit everything
     from the original factor except the fill mode, which flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transposed factor is stored explicitly, so it is solved non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor (fill mode flips, rest inherited) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
995 
/* Unary functor usable on host and device: truncates the real part of a
   PetscScalar to a PetscInt (the imaginary part, if any, is discarded). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};
999 
1000 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1001 {
1002   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1003   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1004   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1005   cusparseStatus_t              stat;
1006   cusparseIndexBase_t           indexBase;
1007 
1008   PetscFunctionBegin;
1009   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1010   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1011   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1012   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1013   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1014   if (A->transupdated) PetscFunctionReturn(0);
1015   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1016   PetscCall(PetscLogGpuTimeBegin());
1017   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1018   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1019     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1020     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1021     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1022     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1023     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1024 
1025     /* set alpha and beta */
1026     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1027     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1028     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1029     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1030     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1031     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1032 
1033     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1034       CsrMatrix *matrixT      = new CsrMatrix;
1035       matstructT->mat         = matrixT;
1036       matrixT->num_rows       = A->cmap->n;
1037       matrixT->num_cols       = A->rmap->n;
1038       matrixT->num_entries    = a->nz;
1039       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1040       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1041       matrixT->values         = new THRUSTARRAY(a->nz);
1042 
1043       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1044       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1045 
1046 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1047   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1048       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1049                                indexBase, cusparse_scalartype);
1050       PetscCallCUSPARSE(stat);
1051   #else
1052       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1053            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1054 
1055            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1056            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1057            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1058         */
1059       if (matrixT->num_entries) {
1060         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1061         PetscCallCUSPARSE(stat);
1062 
1063       } else {
1064         matstructT->matDescr = NULL;
1065         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1066       }
1067   #endif
1068 #endif
1069     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1070 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1071       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1072 #else
1073       CsrMatrix *temp = new CsrMatrix;
1074       CsrMatrix *tempT = new CsrMatrix;
1075       /* First convert HYB to CSR */
1076       temp->num_rows = A->rmap->n;
1077       temp->num_cols = A->cmap->n;
1078       temp->num_entries = a->nz;
1079       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1080       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1081       temp->values = new THRUSTARRAY(a->nz);
1082 
1083       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1084       PetscCallCUSPARSE(stat);
1085 
1086       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1087       tempT->num_rows = A->rmap->n;
1088       tempT->num_cols = A->cmap->n;
1089       tempT->num_entries = a->nz;
1090       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1091       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1092       tempT->values = new THRUSTARRAY(a->nz);
1093 
1094       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1095                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1096       PetscCallCUSPARSE(stat);
1097 
1098       /* Last, convert CSC to HYB */
1099       cusparseHybMat_t hybMat;
1100       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1101       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1102       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1103       PetscCallCUSPARSE(stat);
1104 
1105       /* assign the pointer */
1106       matstructT->mat = hybMat;
1107       A->transupdated = PETSC_TRUE;
1108       /* delete temporaries */
1109       if (tempT) {
1110         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1111         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1112         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1113         delete (CsrMatrix *)tempT;
1114       }
1115       if (temp) {
1116         if (temp->values) delete (THRUSTARRAY *)temp->values;
1117         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1118         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1119         delete (CsrMatrix *)temp;
1120       }
1121 #endif
1122     }
1123   }
1124   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1125     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1126     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1127     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1128     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1129     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1130     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1131     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1132     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1133     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1134     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1135     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1136       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1137       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1138       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1139     }
1140     if (!cusparsestruct->csr2csc_i) {
1141       THRUSTARRAY csr2csc_a(matrix->num_entries);
1142       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1143 
1144       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1145 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1146       void  *csr2cscBuffer;
1147       size_t csr2cscBufferSize;
1148       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1149                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1150       PetscCallCUSPARSE(stat);
1151       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1152 #endif
1153 
1154       if (matrix->num_entries) {
1155         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1156            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1157            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1158 
1159            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1160            should be filled with indexBase. So I just take a shortcut here.
1161         */
1162         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1163 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1164                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1165         PetscCallCUSPARSE(stat);
1166 #else
1167                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1168         PetscCallCUSPARSE(stat);
1169 #endif
1170       } else {
1171         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1172       }
1173 
1174       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1175       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1176 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1177       PetscCallCUDA(cudaFree(csr2cscBuffer));
1178 #endif
1179     }
1180     PetscCallThrust(
1181       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1182   }
1183   PetscCall(PetscLogGpuTimeEnd());
1184   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1185   /* the compressed row indices is not used for matTranspose */
1186   matstructT->cprowIndices = NULL;
1187   /* assign the pointer */
1188   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1189   A->transupdated                                = PETSC_TRUE;
1190   PetscFunctionReturn(0);
1191 }
1192 
1193 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly. The transposed factor structs are
     built lazily on the first transpose solve and cached in cusparseTriFactors afterwards. */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation. Wrap the thrust call in PetscCallThrust (as done
     elsewhere in this file) so a thrust::system_error becomes a PETSc error instead of escaping. */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve with the transposed upper factor: xarray -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with the transposed lower factor: tempGPU -> xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1245 
/* Transpose triangular solve for a matrix factored in natural ordering, i.e. no row/column
   permutations need to be applied to bb or xx. The transposed factor data is created lazily
   on first use and cached on the Mat_SeqAIJCUSPARSETriFactors struct. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;

  PetscFunctionBegin;
  /* Build the transposed factors on the fly if this is the first transpose solve */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  /* Map both vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Triangular solve with the transposed upper factor: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Triangular solve with the transposed lower factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x, loT->solvePolicy, loT->solveBuffer));

  /* Hand the vectors back and account for the work: ~2*nnz flops for the two solves */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1283 
/* Solve A x = b using the cached L/U triangular factors, applying the factorization's
   row permutation to b on entry and the column permutation to the solution on exit. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation. Wrap the thrust call in PetscCallThrust (as done
     elsewhere in this file) so a thrust::system_error becomes a PETSc error instead of escaping. */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()));

  /* Next, solve with the lower factor: tempGPU -> xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve with the upper factor: xarray -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1323 
/* Triangular solve for a matrix factored in natural ordering: no row/column permutations
   are applied, so bb feeds the lower solve directly and the upper solve writes straight into xx. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;

  PetscFunctionBegin;
  /* Map both vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Triangular solve with the lower factor: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Triangular solve with the upper factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  /* Hand the vectors back and account for the work: ~2*nnz flops for the two solves */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1353 
1354 #if CUSPARSE_VERSION >= 11500
1355 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/* Solve fact x = b with the ILU(0) factors via the cuSPARSE generic SpSV API.

   The SpSV descriptors (spsvDescr_L/U) were created and analyzed during numeric factorization,
   so this routine only re-points the cached dense-vector descriptors at the current b/x arrays
   and performs the two triangular solves: L y = b, then U x = y. fs->Y is a device scratch
   array holding the intermediate y.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!

  /* Solve U*x = y; reuse dnVecDescr_X, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* ~2*nz flops for the two triangular solves, minus n for the unit diagonal of L */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1387 
/* Solve fact^T x = b with the ILU(0) factors, i.e. U^T y = b followed by L^T x = y.

   The transpose SpSV descriptors/buffers (spsvDescr_Lt/Ut) are created lazily on the first
   transpose solve; their numeric analysis is redone whenever the factor values change
   (tracked via fs->updatedTransposeSpSVAnalysis, reset in the numeric factorization).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Redo the (numeric) SpSV analysis if the factor values changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y; reuse dnVecDescr_X, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1436 
/* Numeric ILU(0) factorization on the GPU: copy A's values into fact's device CSR arrays
   (the pattern was set up in the symbolic phase), run cusparseXcsrilu02() in place, then
   perform the numeric SpSV analysis for the L and U solves. On success, fact's solve
   function pointers are switched to the ILU0-specific routines.

   Note `info` is unused here; the fill etc. were consumed in the symbolic phase.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() reports the row/column of a numerical zero pivot, if any */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Install the ILU0-specific solvers; matsolve variants are not provided for this path */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1488 
1489 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1490 {
1491   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1492   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1493   PetscInt                      m, nz;
1494 
1495   PetscFunctionBegin;
1496   if (PetscDefined(USE_DEBUG)) {
1497     PetscInt  i;
1498     PetscBool flg, missing;
1499 
1500     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1501     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1502     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1503     PetscCall(MatMissingDiagonal(A, &missing, &i));
1504     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1505   }
1506 
1507   /* Free the old stale stuff */
1508   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1509 
1510   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1511      but they will not be used. Allocate them just for easy debugging.
1512    */
1513   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1514 
1515   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1516   fact->factortype             = MAT_FACTOR_ILU;
1517   fact->info.factor_mallocs    = 0;
1518   fact->info.fill_ratio_given  = info->fill;
1519   fact->info.fill_ratio_needed = 1.0;
1520 
1521   aij->row = NULL;
1522   aij->col = NULL;
1523 
1524   /* ====================================================================== */
1525   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1526   /* We'll do in-place factorization on fact                                */
1527   /* ====================================================================== */
1528   const int *Ai, *Aj;
1529 
1530   m  = fact->rmap->n;
1531   nz = aij->nz;
1532 
1533   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1534   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1535   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1536   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1537   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1538   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1539 
1540   /* ====================================================================== */
1541   /* Create descriptors for M, L, U                                         */
1542   /* ====================================================================== */
1543   cusparseFillMode_t fillMode;
1544   cusparseDiagType_t diagType;
1545 
1546   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1547   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1548   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1549 
1550   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1551     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1552     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1553     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1554     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1555   */
1556   fillMode = CUSPARSE_FILL_MODE_LOWER;
1557   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1558   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1559   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1560   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1561 
1562   fillMode = CUSPARSE_FILL_MODE_UPPER;
1563   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1564   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1565   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1566   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1567 
1568   /* ========================================================================= */
1569   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1570   /* ========================================================================= */
1571   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1572   if (m)
1573     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1574                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1575 
1576   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1577   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1578 
1579   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1580   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1581 
1582   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1583   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1584 
1585   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1586   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1587 
1588   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1589      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1590      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1591      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1592    */
1593   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1594     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1595     fs->spsvBuffer_L = fs->factBuffer_M;
1596     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1597   } else {
1598     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1599     fs->spsvBuffer_U = fs->factBuffer_M;
1600     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1601   }
1602 
1603   /* ========================================================================== */
1604   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1605   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1606   /* ========================================================================== */
1607   int              structural_zero;
1608   cusparseStatus_t status;
1609 
1610   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1611   if (m)
1612     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1613                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1614   if (PetscDefined(USE_DEBUG)) {
1615     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1616     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1617     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1618   }
1619 
1620   /* Estimate FLOPs of the numeric factorization */
1621   {
1622     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1623     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1624     PetscLogDouble flops = 0.0;
1625 
1626     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1627     Ai    = Aseq->i;
1628     Adiag = Aseq->diag;
1629     for (PetscInt i = 0; i < m; i++) {
1630       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1631         nzRow  = Ai[i + 1] - Ai[i];
1632         nzLeft = Adiag[i] - Ai[i];
1633         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1634           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1635         */
1636         nzLeft = (nzRow - 1) / 2;
1637         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1638       }
1639     }
1640     fs->numericFactFlops = flops;
1641   }
1642   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1643   PetscFunctionReturn(0);
1644 }
1645 
/* MatSolve_SeqAIJCUSPARSE_ICC0 - solve A x = b on the GPU with the ICC(0) factorization A ~ L*Lt

   Only the lower factor L is stored (fs->spMatDescr_L); the second solve applies Lt by running
   cusparseSpSV_solve() with CUSPARSE_OPERATION_TRANSPOSE on the same matrix descriptor.
   The dense-vector descriptors dnVecDescr_X/Y are reused and re-pointed at the caller's arrays
   via cusparseDnVecSetValues(), so no extra device copies of b or x are made.

   Input Parameters:
+  fact - the factored matrix, numerically factorized by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0()
-  b    - right-hand side vector

   Output Parameter:
.  x    - solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); /* X temporarily wraps b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));          /* y lands in the internal work vector fs->Y */
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); /* re-point X at the output array */
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* flop estimate for the two triangular solves */
  PetscFunctionReturn(0);
}
1676 
/* MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - numeric ICC(0) factorization on the GPU

   Copies A's device CSR values into fact, factorizes them in place with cusparseXcsric02(),
   then runs the SpSV analysis phases for the L and Lt solves used by MatSolve_SeqAIJCUSPARSE_ICC0().
   The symbolic phase (descriptors, buffers, csric02 analysis) was set up in
   MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0().

   Input Parameters:
+  fact - the factor matrix whose CSR arrays receive the factorization
.  A    - the matrix to factor, must be MATSEQAIJCUSPARSE with the same nonzero pattern as fact
-  info - factorization options (unused here; pattern/levels were fixed at symbolic time)
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's device values are current before the device-to-device copy */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* cusparseXcsric02 errors out on empty matrices (m=0) */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analyze the forward solve with L; required before cusparseSpSV_solve() can be called */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* Factor now lives on the GPU only; install the device solve routines */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* A = L*Lt is symmetric, so solve == solvetranspose */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed at symbolic time */
  PetscFunctionReturn(0);
}
1729 
/* MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - symbolic ICC(0) setup on the GPU

   ICC(0) keeps A's sparsity pattern, so the "symbolic" phase here is: mirror A's CSR structure
   into fact on the device, create the cuSPARSE matrix/vector/solve descriptors, query and
   allocate the work buffers (sharing the factorization buffer with the larger SpSV buffer to
   save memory), run the csric02 structural analysis, and precompute a flop estimate for the
   numeric phase.

   Input Parameters:
+  fact - the matrix that will hold the factorization
.  A    - matrix to be factored, must be MATSEQAIJCUSPARSE
.  perm - row/column permutation (must be identity for this fast path; checked by the caller)
-  info - factorization options (levels must be 0; checked by the caller)
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i)); /* csric02 requires all diagonal entries to be present */
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) introduces no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* M uses the legacy descriptor API (for csric02); L uses the generic     */
  /* SpMat API (for SpSV). Both alias the same device CSR arrays.           */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); /* cusparseXcsric02 errors out on empty matrices (m=0) */

  /* Internal work vectors wrapped by the dense-vector descriptors used in SpSV */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* cusparseXcsric02 errors out on empty matrices (m=0) */
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization; logged later by the numeric routine */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* for a symmetric pattern, about half of the off-diagonal entries are to the left */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
1871 #endif
1872 
/* Symbolic ILU dispatcher: take the device ILU(0) fast path when possible (zero levels,
   identity row/column permutations, factorization bound to the device), otherwise fall back
   to the CPU symbolic phase and redirect the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool rowIdentity = PETSC_FALSE, colIdentity = PETSC_FALSE;

  if (tf->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &rowIdentity));
    PetscCall(ISIdentity(iscol, &colIdentity));
  }
  if (!info->levels && rowIdentity && colIdentity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(0);
  }
#endif
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tf)); /* drop any stale device factors */
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1895 
/* Symbolic LU: performed on the CPU by the plain AIJ implementation; the numeric phase
   is redirected to the CUSPARSE version, which builds the device triangular factors. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tf)); /* discard stale device factors from a prior factorization */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1906 
/* Symbolic ICC dispatcher: use the device ICC(0) fast path when the permutation is identity,
   no fill levels are requested, and factorization is bound to the device; otherwise run the
   CPU symbolic phase and point the numeric phase at the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool identity = PETSC_FALSE;

  if (tf->factorizeOnDevice) PetscCall(ISIdentity(perm, &identity));
  if (!info->levels && identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(0);
  }
#endif
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tf)); /* drop any stale device factors */
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1926 
/* Symbolic Cholesky: performed on the CPU by the plain AIJ implementation; the numeric
   phase is redirected to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tf)); /* discard stale device factors from a prior factorization */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1937 
/* Report the solver package implementing this factorization (always MATSOLVERCUSPARSE).
   Composed on factor matrices under the name "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1944 
1945 /*MC
1946   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
1952 
1953   Level: beginner
1954 
1955 .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1956 M*/
1957 
/* MatGetFactor_seqaijcusparse_cusparse - create an (empty) factor matrix of type MATSEQAIJCUSPARSE

   Creates B sized like A, reads -mat_factor_bind_factorization ("device" (default) or "host")
   to decide where factorization will run, and installs the symbolic factorization routines
   appropriate for the requested factor type. GPU routines are only installed when A is not
   bound to the CPU.

   Input Parameters:
+  A     - the matrix to be factored (only its sizes and options prefix are used here)
-  ftype - one of MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ICC

   Output Parameter:
.  B - the new factor matrix
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; /* must be set before MatSetType() so the type constructor sees it */
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Let the user choose where the factorization runs; option is read under the factor's prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A is bound to the CPU: use the plain AIJ (host) symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); /* arrays are allocated by the symbolic factorization */
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2010 
/* MatSeqAIJCUSPARSECopyFromGPU - copy the matrix values from the device back to the host CSR arrays

   Only runs when the device copy is newer (offloadmask == PETSC_OFFLOAD_GPU). For non-factored
   matrices the values come from the cuSPARSE CSR structure; for device-factored matrices (when
   supported) they come from the triangular-factor value array. The i/j structure is not copied:
   it is assumed unchanged on the host. On success the mask becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* For a factored matrix, spptr holds the tri-factors struct instead; only one interpretation is valid */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2040 
/* Hand out the host value array for read-write access, first syncing values from the
   GPU if the device copy is newer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
2048 
/* Finish read-write host access: the values may have been modified, so mark the
   device copy stale (PETSC_OFFLOAD_CPU) and invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2056 
/* Hand out the host value array for read-only access, first syncing values from the
   GPU if the device copy is newer. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
2064 
/* Finish read-only host access: nothing was modified, so the offload mask is left
   untouched; just invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
2071 
/* Hand out the host value array for write-only access. The caller promises to overwrite
   all values, so no device-to-host sync is performed. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2078 
/* Finish write-only host access: host values are now authoritative, so mark the
   device copy stale (PETSC_OFFLOAD_CPU) and invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2086 
/* MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - expose the device CSR arrays of the matrix

   Syncs host values to the device if needed, then returns raw device pointers into the
   cuSPARSE CSR structure. Only valid for non-factored matrices. Because the device arrays
   use 32-bit indices (THRUSTINTARRAY32), requesting i or j errors out in 64-bit-index builds.

   Output Parameters (each may be NULL if not wanted):
+  i     - device pointer to the row offsets (m+1 entries)
.  j     - device pointer to the column indices
.  a     - device pointer to the values
-  mtype - memory type of the returned pointers (always PETSC_MEMTYPE_CUDA)
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make the device copy current before exposing its pointers */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* fixed grammar/capitalization of the error message ("does not supported" -> "does not support") */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
2117 
2118 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2119 {
2120   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2121   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2122   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2123   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2124   cusparseStatus_t              stat;
2125   PetscBool                     both = PETSC_TRUE;
2126 
2127   PetscFunctionBegin;
2128   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2129   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2130     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2131       CsrMatrix *matrix;
2132       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2133 
2134       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2135       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2136       matrix->values->assign(a->a, a->a + a->nz);
2137       PetscCallCUDA(WaitForCUDA());
2138       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2139       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2140       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2141     } else {
2142       PetscInt nnz;
2143       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2144       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2145       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2146       delete cusparsestruct->workVector;
2147       delete cusparsestruct->rowoffsets_gpu;
2148       cusparsestruct->workVector     = NULL;
2149       cusparsestruct->rowoffsets_gpu = NULL;
2150       try {
2151         if (a->compressedrow.use) {
2152           m    = a->compressedrow.nrows;
2153           ii   = a->compressedrow.i;
2154           ridx = a->compressedrow.rindex;
2155         } else {
2156           m    = A->rmap->n;
2157           ii   = a->i;
2158           ridx = NULL;
2159         }
2160         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2161         if (!a->a) {
2162           nnz  = ii[m];
2163           both = PETSC_FALSE;
2164         } else nnz = a->nz;
2165         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2166 
2167         /* create cusparse matrix */
2168         cusparsestruct->nrows = m;
2169         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2170         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2171         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2172         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2173 
2174         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2175         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2176         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2177         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2178         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2179         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2180         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2181 
2182         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2183         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2184           /* set the matrix */
2185           CsrMatrix *mat   = new CsrMatrix;
2186           mat->num_rows    = m;
2187           mat->num_cols    = A->cmap->n;
2188           mat->num_entries = nnz;
2189           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2190           mat->row_offsets->assign(ii, ii + m + 1);
2191 
2192           mat->column_indices = new THRUSTINTARRAY32(nnz);
2193           mat->column_indices->assign(a->j, a->j + nnz);
2194 
2195           mat->values = new THRUSTARRAY(nnz);
2196           if (a->a) mat->values->assign(a->a, a->a + nnz);
2197 
2198           /* assign the pointer */
2199           matstruct->mat = mat;
2200 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2201           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2202             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2203                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2204             PetscCallCUSPARSE(stat);
2205           }
2206 #endif
2207         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2208 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2209           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2210 #else
2211           CsrMatrix *mat = new CsrMatrix;
2212           mat->num_rows = m;
2213           mat->num_cols = A->cmap->n;
2214           mat->num_entries = nnz;
2215           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2216           mat->row_offsets->assign(ii, ii + m + 1);
2217 
2218           mat->column_indices = new THRUSTINTARRAY32(nnz);
2219           mat->column_indices->assign(a->j, a->j + nnz);
2220 
2221           mat->values = new THRUSTARRAY(nnz);
2222           if (a->a) mat->values->assign(a->a, a->a + nnz);
2223 
2224           cusparseHybMat_t hybMat;
2225           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2226           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2227           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2228           PetscCallCUSPARSE(stat);
2229           /* assign the pointer */
2230           matstruct->mat = hybMat;
2231 
2232           if (mat) {
2233             if (mat->values) delete (THRUSTARRAY *)mat->values;
2234             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2235             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2236             delete (CsrMatrix *)mat;
2237           }
2238 #endif
2239         }
2240 
2241         /* assign the compressed row indices */
2242         if (a->compressedrow.use) {
2243           cusparsestruct->workVector = new THRUSTARRAY(m);
2244           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2245           matstruct->cprowIndices->assign(ridx, ridx + m);
2246           tmp = m;
2247         } else {
2248           cusparsestruct->workVector = NULL;
2249           matstruct->cprowIndices    = NULL;
2250           tmp                        = 0;
2251         }
2252         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2253 
2254         /* assign the pointer */
2255         cusparsestruct->mat = matstruct;
2256       } catch (char *ex) {
2257         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2258       }
2259       PetscCallCUDA(WaitForCUDA());
2260       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2261       cusparsestruct->nonzerostate = A->nonzerostate;
2262     }
2263     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2264   }
2265   PetscFunctionReturn(0);
2266 }
2267 
/* Thrust functor applied to a zipped (source, destination) tuple: accumulates
   the first element into the second, i.e. dst = dst + src */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped) + thrust::get<1>(zipped);
  }
};
2275 
/* Thrust functor applied to a zipped (source, destination) tuple: copies the
   first element onto the second, i.e. dst = src */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
2283 
/* Thrust functor applied to a zipped tuple, copying in the opposite direction
   of VecCUDAEquals: the second element is written onto the first */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2291 
/* Product data attached to C->product for cuSPARSE-based matrix-matrix products.
   Caches device buffers and cuSPARSE descriptors so they can be reused between
   the symbolic and numeric phases; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense; /* was C of (host) type MATSEQDENSE at symbolic time? If so the numeric phase converts the result back */
  PetscScalar   *Bt;       /* device buffer holding B^T; only allocated for CUDA < 11, where csrmm cannot transpose B */
  Mat            X;        /* dense intermediate holding the sparse-dense product for MATPRODUCT_RARt and MATPRODUCT_PtAP */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count logged by the numeric phase */
  CsrMatrix     *Bcsr;     /* uncompressed CSR copy of B used when B itself is stored in compressed row format */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C (or for X in the RARt/PtAP case) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra device work buffers, presumably for cusparseSpGEMMreuse (allocated outside this chunk) */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* currently allocated size of mmBuffer */
  void                 *mmBuffer;     /* SpMM/SpGEMM device work buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* cuSPARSE SpGEMM state shared between symbolic and numeric */
#endif
};
2316 
/* Destroy callback for the MatMatCusparse product data: frees all device work
   buffers and cuSPARSE descriptors, destroys the dense intermediate X, and
   finally frees the struct itself. Registered as C->product->destroy */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op, so no guard is needed */
  delete mmdata->Bcsr;                 /* deleting a NULL pointer is likewise safe */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* the descriptor/buffer fields are only set on demand, hence the NULL checks */
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2340 
2341 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2342 
/* Numeric phase of a product C = op(A) op(B) (or the RARt/PtAP variants) where
   A is MATSEQAIJCUSPARSE and B is dense. The sparse-dense multiply is done with
   cusparseSpMM on CUDA >= 11, or the legacy csrmm (with an explicit cuBLAS
   transpose of B when op(B) = B^T is needed) on older toolkits. For RARt/PtAP
   the result of the sparse-dense multiply lands in the intermediate mmdata->X
   and a dense-dense multiply with B completes the product. Registered as
   C->ops->productnumeric by MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA() */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select op(A) and the m x n dimensions of the sparse-dense product per type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes X = A*B; the remaining factor is applied after the SpMM below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* an explicitly stored transpose was requested: multiply with it untransposed */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes X = A*B^T; the remaining factor is applied after the SpMM below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* write into the intermediate X instead of C; the final dense-dense multiply below fills C */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) { /* stale B descriptor: drop it so it is rebuilt with the new leading dimension */
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) { /* same for the C/X descriptor */
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    /* create the sparse descriptor for A lazily, on first use */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* only grow the work buffer, never shrink it */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the buffer Bt preallocated by the symbolic phase */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    /* fold the remaining factor of B into X with a dense-dense multiply to obtain C */
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* undo the temporary CPU->GPU conversions made above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2500 
/* Symbolic phase of a product with A of type MATSEQAIJCUSPARSE and B dense:
   validates the operands, sets the sizes and the (CUDA) dense type of C,
   allocates the MatMatCusparse product data (including the dense intermediate X
   needed by RARt/PtAP and, on CUDA < 11, the buffer for B^T), and installs the
   numeric-phase callback */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the resulting dense matrix C for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense; /* remembered so the numeric phase converts C back to the host type */
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2571 
/* Numeric phase of the sparse-sparse product C = op(A) op(B) with A, B and C
   all of type MATSEQAIJCUSPARSE. The structure of C, the work buffers, and the
   SpGEMM descriptor were created by the symbolic phase; this routine only
   recomputes the numerical values, using cusparseSpGEMMreuse_compute
   (CUDA >= 11.4), cusparseSpGEMM_compute/copy (CUDA 11.x), or csrgemm
   (CUDA < 11) */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, only the assembly bookkeeping below */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* the symbolic phase may have replaced AtB/ABt by AB using symmetry of A/B;
     mirror that decision here so we pick the same mult structs */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are handled via explicitly formed transpose matrices (see opA/opB note above) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure of C was fixed at symbolic time; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2692 
2693 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2694 {
2695   Mat_Product                  *product = C->product;
2696   Mat                           A, B;
2697   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2698   Mat_SeqAIJ                   *a, *b, *c;
2699   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2700   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2701   PetscInt                      i, j, m, n, k;
2702   PetscBool                     flg;
2703   cusparseStatus_t              stat;
2704   MatProductType                ptype;
2705   MatMatCusparse               *mmdata;
2706   PetscLogDouble                flops;
2707   PetscBool                     biscompressed, ciscompressed;
2708 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2709   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2710   cusparseSpMatDescr_t BmatSpDescr;
2711 #else
2712   int cnz;
2713 #endif
2714   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2715 
2716   PetscFunctionBegin;
2717   MatCheckProduct(C, 1);
2718   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2719   A = product->A;
2720   B = product->B;
2721   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2722   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2723   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2724   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2725   a = (Mat_SeqAIJ *)A->data;
2726   b = (Mat_SeqAIJ *)B->data;
2727   /* product data */
2728   PetscCall(PetscNew(&mmdata));
2729   C->product->data    = mmdata;
2730   C->product->destroy = MatDestroy_MatMatCusparse;
2731 
2732   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2733   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2734   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2735   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2736   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2737   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2738 
2739   ptype = product->type;
2740   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2741     ptype                                          = MATPRODUCT_AB;
2742     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2743   }
2744   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2745     ptype                                          = MATPRODUCT_AB;
2746     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2747   }
2748   biscompressed = PETSC_FALSE;
2749   ciscompressed = PETSC_FALSE;
2750   switch (ptype) {
2751   case MATPRODUCT_AB:
2752     m    = A->rmap->n;
2753     n    = B->cmap->n;
2754     k    = A->cmap->n;
2755     Amat = Acusp->mat;
2756     Bmat = Bcusp->mat;
2757     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2758     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2759     break;
2760   case MATPRODUCT_AtB:
2761     m = A->cmap->n;
2762     n = B->cmap->n;
2763     k = A->rmap->n;
2764     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2765     Amat = Acusp->matTranspose;
2766     Bmat = Bcusp->mat;
2767     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2768     break;
2769   case MATPRODUCT_ABt:
2770     m = A->rmap->n;
2771     n = B->rmap->n;
2772     k = A->cmap->n;
2773     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2774     Amat = Acusp->mat;
2775     Bmat = Bcusp->matTranspose;
2776     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2777     break;
2778   default:
2779     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2780   }
2781 
2782   /* create cusparse matrix */
2783   PetscCall(MatSetSizes(C, m, n, m, n));
2784   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2785   c     = (Mat_SeqAIJ *)C->data;
2786   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2787   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2788   Ccsr  = new CsrMatrix;
2789 
2790   c->compressedrow.use = ciscompressed;
2791   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2792     c->compressedrow.nrows = a->compressedrow.nrows;
2793     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2794     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2795     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2796     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2797     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2798   } else {
2799     c->compressedrow.nrows  = 0;
2800     c->compressedrow.i      = NULL;
2801     c->compressedrow.rindex = NULL;
2802     Ccusp->workVector       = NULL;
2803     Cmat->cprowIndices      = NULL;
2804   }
2805   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2806   Ccusp->mat        = Cmat;
2807   Ccusp->mat->mat   = Ccsr;
2808   Ccsr->num_rows    = Ccusp->nrows;
2809   Ccsr->num_cols    = n;
2810   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2811   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2812   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2813   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2814   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
2815   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
2816   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2817   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2818   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2819   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2820   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2821     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2822     c->nz                = 0;
2823     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2824     Ccsr->values         = new THRUSTARRAY(c->nz);
2825     goto finalizesym;
2826   }
2827 
2828   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2829   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2830   Acsr = (CsrMatrix *)Amat->mat;
2831   if (!biscompressed) {
2832     Bcsr = (CsrMatrix *)Bmat->mat;
2833 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2834     BmatSpDescr = Bmat->matDescr;
2835 #endif
2836   } else { /* we need to use row offsets for the full matrix */
2837     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2838     Bcsr                 = new CsrMatrix;
2839     Bcsr->num_rows       = B->rmap->n;
2840     Bcsr->num_cols       = cBcsr->num_cols;
2841     Bcsr->num_entries    = cBcsr->num_entries;
2842     Bcsr->column_indices = cBcsr->column_indices;
2843     Bcsr->values         = cBcsr->values;
2844     if (!Bcusp->rowoffsets_gpu) {
2845       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2846       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2847       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2848     }
2849     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2850     mmdata->Bcsr      = Bcsr;
2851 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2852     if (Bcsr->num_rows && Bcsr->num_cols) {
2853       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2854       PetscCallCUSPARSE(stat);
2855     }
2856     BmatSpDescr = mmdata->matSpBDescr;
2857 #endif
2858   }
2859   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2860   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2861   /* precompute flops count */
2862   if (ptype == MATPRODUCT_AB) {
2863     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2864       const PetscInt st = a->i[i];
2865       const PetscInt en = a->i[i + 1];
2866       for (j = st; j < en; j++) {
2867         const PetscInt brow = a->j[j];
2868         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2869       }
2870     }
2871   } else if (ptype == MATPRODUCT_AtB) {
2872     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2873       const PetscInt anzi = a->i[i + 1] - a->i[i];
2874       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2875       flops += (2. * anzi) * bnzi;
2876     }
2877   } else { /* TODO */
2878     flops = 0.;
2879   }
2880 
2881   mmdata->flops = flops;
2882   PetscCall(PetscLogGpuTimeBegin());
2883 
2884 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2885   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2886   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2887   PetscCallCUSPARSE(stat);
2888   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2889   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2890   {
2891     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2892      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2893   */
2894     void *dBuffer1 = NULL;
2895     void *dBuffer2 = NULL;
2896     void *dBuffer3 = NULL;
2897     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2898     size_t bufferSize1 = 0;
2899     size_t bufferSize2 = 0;
2900     size_t bufferSize3 = 0;
2901     size_t bufferSize4 = 0;
2902     size_t bufferSize5 = 0;
2903 
2904     /*----------------------------------------------------------------------*/
2905     /* ask bufferSize1 bytes for external memory */
2906     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2907     PetscCallCUSPARSE(stat);
2908     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2909     /* inspect the matrices A and B to understand the memory requirement for the next step */
2910     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2911     PetscCallCUSPARSE(stat);
2912 
2913     /*----------------------------------------------------------------------*/
2914     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2915     PetscCallCUSPARSE(stat);
2916     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2917     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2918     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2919     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2920     PetscCallCUSPARSE(stat);
2921     PetscCallCUDA(cudaFree(dBuffer1));
2922     PetscCallCUDA(cudaFree(dBuffer2));
2923 
2924     /*----------------------------------------------------------------------*/
2925     /* get matrix C non-zero entries C_nnz1 */
2926     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2927     c->nz = (PetscInt)C_nnz1;
2928     /* allocate matrix C */
2929     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2930     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2931     Ccsr->values = new THRUSTARRAY(c->nz);
2932     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2933     /* update matC with the new pointers */
2934     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2935     PetscCallCUSPARSE(stat);
2936 
2937     /*----------------------------------------------------------------------*/
2938     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2939     PetscCallCUSPARSE(stat);
2940     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2941     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2942     PetscCallCUSPARSE(stat);
2943     PetscCallCUDA(cudaFree(dBuffer3));
2944     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2945     PetscCallCUSPARSE(stat);
2946     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2947   }
2948   #else
2949   size_t bufSize2;
2950   /* ask bufferSize bytes for external memory */
2951   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2952   PetscCallCUSPARSE(stat);
2953   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2954   /* inspect the matrices A and B to understand the memory requirement for the next step */
2955   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2956   PetscCallCUSPARSE(stat);
2957   /* ask bufferSize again bytes for external memory */
2958   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2959   PetscCallCUSPARSE(stat);
2960   /* The CUSPARSE documentation is not clear, nor the API
2961      We need both buffers to perform the operations properly!
2962      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2963      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2964      is stored in the descriptor! What a messy API... */
2965   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2966   /* compute the intermediate product of A * B */
2967   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2968   PetscCallCUSPARSE(stat);
2969   /* get matrix C non-zero entries C_nnz1 */
2970   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2971   c->nz = (PetscInt)C_nnz1;
2972   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2973                       mmdata->mmBufferSize / 1024));
2974   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2975   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2976   Ccsr->values = new THRUSTARRAY(c->nz);
2977   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2978   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2979   PetscCallCUSPARSE(stat);
2980   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2981   PetscCallCUSPARSE(stat);
2982   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2983 #else
2984   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2985   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2986                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2987   PetscCallCUSPARSE(stat);
2988   c->nz = cnz;
2989   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2990   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2991   Ccsr->values = new THRUSTARRAY(c->nz);
2992   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2993 
2994   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2995   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2996      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2997      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2998   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2999                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3000   PetscCallCUSPARSE(stat);
3001 #endif
3002   PetscCall(PetscLogGpuFlops(mmdata->flops));
3003   PetscCall(PetscLogGpuTimeEnd());
3004 finalizesym:
3005   c->singlemalloc = PETSC_FALSE;
3006   c->free_a       = PETSC_TRUE;
3007   c->free_ij      = PETSC_TRUE;
3008   PetscCall(PetscMalloc1(m + 1, &c->i));
3009   PetscCall(PetscMalloc1(c->nz, &c->j));
3010   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3011     PetscInt      *d_i = c->i;
3012     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3013     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3014     ii = *Ccsr->row_offsets;
3015     jj = *Ccsr->column_indices;
3016     if (ciscompressed) d_i = c->compressedrow.i;
3017     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3018     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3019   } else {
3020     PetscInt *d_i = c->i;
3021     if (ciscompressed) d_i = c->compressedrow.i;
3022     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3023     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3024   }
3025   if (ciscompressed) { /* need to expand host row offsets */
3026     PetscInt r = 0;
3027     c->i[0]    = 0;
3028     for (k = 0; k < c->compressedrow.nrows; k++) {
3029       const PetscInt next = c->compressedrow.rindex[k];
3030       const PetscInt old  = c->compressedrow.i[k];
3031       for (; r < next; r++) c->i[r + 1] = old;
3032     }
3033     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3034   }
3035   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3036   PetscCall(PetscMalloc1(m, &c->ilen));
3037   PetscCall(PetscMalloc1(m, &c->imax));
3038   c->maxnz         = c->nz;
3039   c->nonzerorowcnt = 0;
3040   c->rmax          = 0;
3041   for (k = 0; k < m; k++) {
3042     const PetscInt nn = c->i[k + 1] - c->i[k];
3043     c->ilen[k] = c->imax[k] = nn;
3044     c->nonzerorowcnt += (PetscInt) !!nn;
3045     c->rmax = PetscMax(c->rmax, nn);
3046   }
3047   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3048   PetscCall(PetscMalloc1(c->nz, &c->a));
3049   Ccsr->num_entries = c->nz;
3050 
3051   C->nonzerostate++;
3052   PetscCall(PetscLayoutSetUp(C->rmap));
3053   PetscCall(PetscLayoutSetUp(C->cmap));
3054   Ccusp->nonzerostate = C->nonzerostate;
3055   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3056   C->preallocated     = PETSC_TRUE;
3057   C->assembled        = PETSC_FALSE;
3058   C->was_assembled    = PETSC_FALSE;
3059   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3060     mmdata->reusesym = PETSC_TRUE;
3061     C->offloadmask   = PETSC_OFFLOAD_GPU;
3062   }
3063   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3064   PetscFunctionReturn(0);
3065 }
3066 
3067 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3068 
3069 /* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  /* Decide the backend: B dense -> dense kernels; B (and C for ABC) CUSPARSE and not
     bound to CPU -> CUSPARSE kernels; otherwise fall back to the SeqAIJ CPU path */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu  = PETSC_FALSE;
    const char *apiname = NULL; /* user-facing API name, also used as the help string */
    const char *apiopt  = NULL; /* API-specific -<api>_backend_cpu option name */
    const char *pname   = NULL; /* MatProduct_<type> title for the generic option path */

    /* The per-type option blocks only differ in three strings, so table-drive them
       instead of repeating the PetscOptionsBegin/Bool/End boilerplate five times.
       MATPRODUCT_ABt intentionally has no CPU-backend option (as before). */
    switch (product->type) {
    case MATPRODUCT_AB:
      apiname = "MatMatMult";
      apiopt  = "-matmatmult_backend_cpu";
      pname   = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname = "MatTransposeMatMult";
      apiopt  = "-mattransposematmult_backend_cpu";
      pname   = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname = "MatPtAP";
      apiopt  = "-matptap_backend_cpu";
      pname   = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname = "MatRARt";
      apiopt  = "-matrart_backend_cpu";
      pname   = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname = "MatMatMatMult";
      apiopt  = "-matmatmatmult_backend_cpu";
      pname   = "MatProduct_ABC";
      break;
    default:
      break;
    }
    if (apiname) {
      if (product->api_user) { /* options named after the legacy API entry point, e.g. -matmatmult_backend_cpu */
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, apiname, "Mat");
        PetscCall(PetscOptionsBool(apiopt, "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else { /* generic MatProduct interface shares one option name for all types */
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, pname, "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized CUSPARSE kernel for these; compose them from the basic products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
3186 
/* yy = A*xx: thin wrapper over the shared kernel (no vector to add: yy arg NULL, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3193 
/* zz = A*xx + yy: thin wrapper over the shared kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3200 
/* yy = A^H*xx: thin wrapper over the shared kernel (trans=TRUE, herm=TRUE selects the conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3207 
/* zz = A^H*xx + yy: thin wrapper over the shared kernel (trans=TRUE, herm=TRUE selects the conjugate transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3214 
/* yy = A^T*xx: thin wrapper over the shared kernel (trans=TRUE, herm=FALSE selects the plain transpose) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3221 
/* Scatter-add: y[idx[i]] += x[i] for i in [0, n).
   Expects a 1-D launch with gridDim.x*blockDim.x >= n; surplus threads exit via the
   bounds check. Not atomic -- NOTE(review): assumes the idx entries are pairwise
   distinct (e.g. compressed-row indices); confirm at call sites.
   idx and x are read-only, so they are const-qualified (callers passing non-const
   pointers convert implicitly). */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* widen to PetscInt before the multiply: blockIdx.x*blockDim.x is unsigned 32-bit
     arithmetic and would wrap for flat indices beyond 2^32 when PetscInt is 64-bit */
  PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3227 
3228 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3229 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3230 {
3231   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3232   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3233   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3234   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3235   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3236   PetscBool                     compressed;
3237 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3238   PetscInt nx, ny;
3239 #endif
3240 
3241   PetscFunctionBegin;
3242   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3243   if (!a->nz) {
3244     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
3245     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3246     PetscFunctionReturn(0);
3247   }
3248   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3249   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3250   if (!trans) {
3251     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3252     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3253   } else {
3254     if (herm || !A->form_explicit_transpose) {
3255       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3256       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3257     } else {
3258       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3259       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3260     }
3261   }
3262   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3263   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3264 
3265   try {
3266     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3267     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3268     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3269 
3270     PetscCall(PetscLogGpuTimeBegin());
3271     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3272       /* z = A x + beta y.
3273          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3274          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3275       */
3276       xptr = xarray;
3277       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3278       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3279 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3280       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3281           allocated to accommodate different uses. So we get the length info directly from mat.
3282        */
3283       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3284         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3285         nx             = mat->num_cols;
3286         ny             = mat->num_rows;
3287       }
3288 #endif
3289     } else {
3290       /* z = A^T x + beta y
3291          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3292          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3293        */
3294       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3295       dptr = zarray;
3296       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3297       if (compressed) { /* Scatter x to work vector */
3298         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3299 
3300         thrust::for_each(
3301 #if PetscDefined(HAVE_THRUST_ASYNC)
3302           thrust::cuda::par.on(PetscDefaultCudaStream),
3303 #endif
3304           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3305           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3306       }
3307 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3308       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3309         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3310         nx             = mat->num_rows;
3311         ny             = mat->num_cols;
3312       }
3313 #endif
3314     }
3315 
3316     /* csr_spmv does y = alpha op(A) x + beta y */
3317     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3318 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3319       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3320       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3321         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3322         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3323         PetscCallCUSPARSE(
3324           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3325         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3326 
3327         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3328       } else {
3329         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3330         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3331         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3332       }
3333 
3334       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3335                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3336 #else
3337       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3338       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3339 #endif
3340     } else {
3341       if (cusparsestruct->nrows) {
3342 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3343         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3344 #else
3345         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3346         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3347 #endif
3348       }
3349     }
3350     PetscCall(PetscLogGpuTimeEnd());
3351 
3352     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3353       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3354         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3355           PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
3356         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3357           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
3358         }
3359       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3360         PetscCall(VecSet_SeqCUDA(zz, 0));
3361       }
3362 
3363       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3364       if (compressed) {
3365         PetscCall(PetscLogGpuTimeBegin());
3366         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3367            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3368            prevent that. So I just add a ScatterAdd kernel.
3369          */
3370 #if 0
3371         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3372         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3373                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3374                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3375                          VecCUDAPlusEquals());
3376 #else
3377         PetscInt n = matstruct->cprowIndices->size();
3378         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3379 #endif
3380         PetscCall(PetscLogGpuTimeEnd());
3381       }
3382     } else {
3383       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3384     }
3385     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3386     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3387     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3388   } catch (char *ex) {
3389     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3390   }
3391   if (yy) {
3392     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3393   } else {
3394     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3395   }
3396   PetscFunctionReturn(0);
3397 }
3398 
/* MatMultTransposeAdd_SeqAIJCUSPARSE - computes zz = A^T*xx + yy on the GPU.

   All work is delegated to MatMultAddKernel_SeqAIJCUSPARSE(); the two trailing
   flags select the operation mode — here (PETSC_TRUE, PETSC_FALSE), i.e. the
   transpose path without the Hermitian variant (compare the Hermitian callers
   which pass the second flag as PETSC_TRUE — confirm against the kernel's
   parameter list, which is outside this view). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3405 
/* MatAssemblyEnd_SeqAIJCUSPARSE - finish assembly on the host, then drop the
   cached device matrix if the nonzero pattern changed during assembly. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscObjectState    state_before   = A->nonzerostate;

  PetscFunctionBegin;
  /* host-side assembly first; it may bump A->nonzerostate */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* a device mirror built for the old pattern is stale once the pattern changes */
  if (cusparsestruct->deviceMat && state_before != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3420 
3421 /* --------------------------------------------------------------------------------*/
3422 /*@
3423    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3425    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3426    assembly performance the user should preallocate the matrix storage by setting
3427    the parameter nz (or the array nnz).  By setting these parameters accurately,
3428    performance during matrix assembly can be increased by more than a factor of 50.
3429 
3430    Collective
3431 
3432    Input Parameters:
3433 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3434 .  m - number of rows
3435 .  n - number of columns
3436 .  nz - number of nonzeros per row (same for all rows)
3437 -  nnz - array containing the number of nonzeros in the various rows
3438          (possibly different for each row) or NULL
3439 
3440    Output Parameter:
3441 .  A - the matrix
3442 
3443    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3445    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3446 
3447    Notes:
3448    If nnz is given then nz is ignored
3449 
3450    The AIJ format, also called
3451    compressed row storage, is fully compatible with standard Fortran 77
3452    storage.  That is, the stored row and column indices can begin at
3453    either one (as in Fortran) or zero.  See the users' manual for details.
3454 
3455    Specify the preallocated storage with either nz or nnz (not both).
3456    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3457    allocation.  For large problems you MUST preallocate memory or you
3458    will get TERRIBLE performance, see the users' manual chapter on matrices.
3459 
3460    By default, this format uses inodes (identical nodes) when possible, to
3461    improve numerical efficiency of matrix-vector products and solves. We
3462    search for consecutive rows with the same nonzero structure, thereby
3463    reusing matrix information to achieve increased efficiency.
3464 
3465    Level: intermediate
3466 
3467 .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3468 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* preallocate via the SeqAIJ implementation directly; the cast only drops const
     because the underlying interface takes a non-const array */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}
3478 
/* MatDestroy_SeqAIJCUSPARSE - destructor for MATSEQAIJCUSPARSE.

   Frees the GPU-side context hanging off A->spptr (mult structs for an
   unfactored matrix, triangular-factor structs for a factored one), removes all
   composed query functions installed by this type, then delegates to the SeqAIJ
   destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* A->spptr points at different structs depending on whether A is a factor */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* composing NULL removes each previously composed function from the object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3500 
3501 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3502 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by first duplicating the host (SeqAIJ) representation, then
   converting the copy in place so it regains the CUSPARSE type and op table. */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
3510 
/* MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y, on the GPU when both operands
   are bound there.

   Dispatch, in order:
   - if X and Y use different axpy implementations (one bound to CPU), fall back
     to the host MatAXPY_SeqAIJ();
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam, reusing Y's sparsity pattern;
   - SAME_NONZERO_PATTERN: value arrays align one-to-one, so a single cublas
     axpy on the value arrays suffices;
   - otherwise: host fallback (the union pattern is computed on the CPU). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU binding: invalidate Y's cached transpose (its values will
       change) and do the work on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nnz count and identical
     row offsets / column indices (compared on the device) mean the patterns match */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* Y's values enter the geam with unit coefficient */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* a and b live on the host here, so switch the cusparse pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicitly sized external work buffer for csrgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the pointer mode expected by the rest of the code */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: ay[i] += a*ax[i] over the nnz values via cublas axpy */
    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: host fallback computes the union pattern */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
3596 
/* MatScale_SeqAIJCUSPARSE - scales every stored value of Y by `a` in place on
   the GPU using a single cublas scal call over the nnz value array. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *varray;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &varray));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, varray, stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &varray));
  /* scaling changes the values, so any cached diagonal is stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3616 
/* MatZeroEntries_SeqAIJCUSPARSE - zeros the matrix values on device and host.

   Device-side CSR value arrays (including the cached transpose, if present) are
   zeroed with thrust::fill; the host arrays are zeroed afterwards.  If the
   device values were zeroed too, both copies agree and the offload mask stays
   BOTH; otherwise only the CPU copy is considered valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the device values were zeroed as well */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    /* keep the cached explicit transpose consistent with the zeroed matrix */
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  /* zero the host-side values; a->i[nrows] is the total number of nonzeros */
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3643 
/* MatBindToCPU_SeqAIJCUSPARSE - switch the operation table between the host
   (SeqAIJ) and device (CUSPARSE) implementations.

   flg = PETSC_TRUE : copy data back to the host and install the SeqAIJ kernels;
   flg = PETSC_FALSE: install the CUSPARSE kernels and GPU array accessors.
   Factored matrices only record the flag; their ops are managed elsewhere. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* ensure the host arrays are up to date before handing control to the CPU ops */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* zeroing the inner SeqAIJ op table restores the default (host) array accessors */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* remove the GPU-only composed functions */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* inner SeqAIJ accessors route array access through the GPU-aware versions */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3707 
/* MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a SeqAIJ matrix to the CUSPARSE
   type (in place, into a new matrix, or into a reused one).

   Attaches the Mat_SeqAIJCUSPARSE (or TriFactors, for factored matrices)
   context to B->spptr, sets VECCUDA as the default vector type, installs the
   GPU operation table via MatBindToCPU(B, PETSC_FALSE), and composes the
   type-specific query functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* MAT_INPLACE_MATRIX: *newmat already is A */
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  /* create the GPU context only once; a reused matrix keeps its existing spptr */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU op table (BindToCPU with flg = PETSC_FALSE) */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3767 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix first,
   then convert it in place so it acquires the CUSPARSE op table and GPU data. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
3775 
3776 /*MC
3777    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3778 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3780    CSR, ELL, or Hybrid format.
3781    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3782 
3783    Options Database Keys:
3784 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3785 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3788 
3789   Level: beginner
3790 
3791 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3792 M*/
3793 
3794 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3795 
/* MatSolverTypeRegister_CUSPARSE - registers the cuSPARSE-backed factorizations
   with PETSc's solver-type registry: the "band" variant (LU on MATSEQAIJ) and
   the general MATSOLVERCUSPARSE LU/Cholesky/ILU/ICC for MATSEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
3807 
/* MatResetPreallocationCOO_SeqAIJCUSPARSE - releases device-side COO assembly
   data (the permutation vectors and, in extended-COO mode, the jmap/perm device
   arrays) so the matrix can be preallocated with a new COO pattern. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  /* thrust vectors: delete and null so a later reset/destroy is a no-op */
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* Bug fix: null the freed pointers. MatSeqAIJCUSPARSE_Destroy() frees any
       non-NULL jmap_d/perm_d, so leaving stale pointers here would double-free. */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3825 
/* MatSeqAIJCUSPARSE_Destroy - frees everything owned by a Mat_SeqAIJCUSPARSE:
   the mult structs (matrix and cached transpose), thrust work/scratch vectors,
   COO support arrays, the cusparse handle, and finally the struct itself
   (PetscFree also nulls *cusparsestruct). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    /* deleting a NULL thrust vector pointer is harmless */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3844 
/* Release the three device vectors owned by a CsrMatrix and the struct itself,
   leaving the caller's pointer NULL so repeated calls are harmless. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3857 
/* Destroys one triangular-factor struct: matrix descriptor, csrsv solve info,
   the CSR matrix, and the solve/csr2csc scratch buffers.  NOTE(review): despite
   the name this is a C++ overload of MatSeqAIJCUSPARSEMultStruct_Destroy taking
   a Mat_SeqAIJCUSPARSETriFactorStruct; overload resolution picks it for the
   TriFactors_Reset() callers below. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3874 
/* MatSeqAIJCUSPARSEMultStruct_Destroy - frees one mult struct: the stored
   matrix (CSR or, pre-CUDA-11, HYB), the legacy matrix descriptor, the
   compressed-row index vector, the device-resident scalar constants, and (for
   CUDA >= 11) the generic-API descriptors and SpMV buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        /* Bug fix: the returned PetscErrorCode was previously discarded; wrap in
           PetscCall() like every other error-returning call in this file */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars (pointer mode DEVICE) */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* generic-API descriptors and the per-operation SpMV state (N/T/H) */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3916 
/* MatSeqAIJCUSPARSETriFactors_Reset - releases all factor data held by the
   triangular-factor container while keeping the container (and its cusparse
   handle) alive, so the matrix can be refactored.

   Frees the four lower/upper (and transpose) factor structs, the permutation
   index vectors, the work vector, the banded-factor device arrays, and — for
   cuSPARSE >= 11.5 — the SpSV-based factorization state (CSR arrays, dense
   vectors, descriptors, and analysis buffers). */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    /* banded-factorization device arrays (values and row starts) */
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* SpSV-based ILU0/IC0 state; cudaFree/Destroy of NULL handles is benign here
       because these members are only set together when the factorization ran */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
3965 
/* Fully tear down the triangular-factor container: release its device resources via Reset,
   destroy the cusparse handle it owns, then free the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle;
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3978 
/* Strict-weak ordering on (row, col) tuples: lexicographic, row first then column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3987 
/* Equality of (row, col) tuples: both components must match */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3995 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
3999 
/* Logical-or of two 0/1 flags, returned as PetscInt (0 or 1) */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != 0 || t2 != 0) ? 1 : 0; }
};
4003 
4004 #include <thrust/iterator/discard_iterator.h>
4005 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Fill the CSR values of A from the COO value array v[] using the permutation
   (cooPerm) and duplicate map (cooPerm_a) built at preallocation time. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device copy of v[] when v[] lives on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info is present; just flush assembly and return */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* NULL v[]: with INSERT_VALUES zero out the matrix; with ADD_VALUES leave values unchanged */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: stage v[] into a temporary device array first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up before accumulating into the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce duplicates straight into the CSR values */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* deleting NULL is a no-op */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
4082 
/* Mark the cached transpose of A as stale; when destroy is true, also free the cached
   transpose multiply structure and the csr2csc permutation used to rebuild it. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4098 
4099 #include <thrust/binary_search.h>
4100 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A on device from n COO (coo_i[], coo_j[]) pairs, and record
   cooPerm (sorted-order permutation) and, when duplicates exist, cooPerm_a (the map from
   each sorted COO entry to its unique nonzero) for later use by MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts nonzero rows */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* COO size changed: drop the previous permutation arrays so they get rebuilt below */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* stage coo_i[] on device if it was given on the host */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    /* coo_i[] and coo_j[] may live in different memory spaces, so check coo_j[] separately */
    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* device scratch for per-row CSR offsets (ends of rows) */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); /* cooPerm = [0,1,...,n-1] before the sort */
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);                                       /* device copy of the sorted column indices */

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host-side AIJ structure (a->i, a->j, a->a) from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4235 
/* COO preallocation dispatcher: use the fast 'Basic' path when the indices are on device
   or contain no negative entries; otherwise fall back to the extended (host) path, which
   supports negative (ignored) indices, and mirror its jmap/perm arrays onto the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* NOTE(review): coo_j[] is dereferenced on the host here based solely on coo_i[]'s
         memory type — assumes both arrays live in the same memory space; confirm callers */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* copy the host-built jmap/perm maps to the device for MatSetValuesCOO's kernel */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4274 
/* 1D grid-stride kernel (works for any launch configuration): for each nonzero i of the
   assembled matrix, sum the COO input values kv[perm[jmap[i]..jmap[i+1])] that map to it,
   then insert into or add to a[i] depending on imode. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar partial = 0.0;

    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) partial += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + partial;
  }
}
4285 
/* Set/add matrix values from COO input v[]; dispatches to the extended-COO kernel when
   preallocation used the extended path, otherwise to the 'Basic' thrust implementation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v; /* device-accessible view of v[] */
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so a write-only access (no host->device copy) suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch kernel launch errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    /* free the temporary device staging of v[], if one was made */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}
4320 
4321 /*@C
4322     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
4323 
4324    Not collective
4325 
    Input Parameters:
+   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4333 
4334     Level: developer
4335 
4336     Note:
4337       When compressed is true, the CSR structure does not contain empty rows
4338 
4339 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4340 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* both outputs required; the `if (i)`/`if (j)` below are thus always true */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host->device copy of the matrix */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* lazily build and cache the uncompressed row offsets on device from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4368 
4369 /*@C
4370     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4371 
4372    Not collective
4373 
    Input Parameters:
+   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4381 
4382     Level: developer
4383 
4384 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4385 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* nothing to release on device; just invalidate the caller's borrowed pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4395 
4396 /*@C
4397    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4398 
4399    Not Collective
4400 
4401    Input Parameter:
4402 .   A - a `MATSEQAIJCUSPARSE` matrix
4403 
4404    Output Parameter:
4405 .   a - pointer to the device data
4406 
4407    Level: developer
4408 
4409    Note:
4410    May trigger host-device copies if up-to-date matrix data is on host
4411 
4412 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4413 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure device values are current (may copy from host) */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offloadmask and transpose cache are left untouched */
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4431 
4432 /*@C
4433    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4434 
4435    Not Collective
4436 
4437    Input Parameter:
4438 .   A - a `MATSEQAIJCUSPARSE` matrix
4439 
4440    Output Parameter:
4441 .   a - pointer to the device data
4442 
4443    Level: developer
4444 
4445 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4446 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state change needed, just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4456 
4457 /*@C
4458    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4459 
4460    Not Collective
4461 
4462    Input Parameter:
4463 .   A - a `MATSEQAIJCUSPARSE` matrix
4464 
4465    Output Parameter:
4466 .   a - pointer to the device data
4467 
4468    Level: developer
4469 
4470    Note:
4471    May trigger host-device copies if up-to-date matrix data is on host
4472 
4473 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4474 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: bring current values to device first */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may modify values, so GPU becomes the valid copy */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
4494 /*@C
4495    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4496 
4497    Not Collective
4498 
4499    Input Parameter:
4500 .   A - a `MATSEQAIJCUSPARSE` matrix
4501 
4502    Output Parameter:
4503 .   a - pointer to the device data
4504 
4505    Level: developer
4506 
4507 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4508 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));             /* values may have changed; cached diagonal is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));   /* bump state so dependents notice the modification */
  *a = NULL;
  PetscFunctionReturn(0);
}
4520 
4521 /*@C
4522    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4523 
4524    Not Collective
4525 
4526    Input Parameter:
4527 .   A - a `MATSEQAIJCUSPARSE` matrix
4528 
4529    Output Parameter:
4530 .   a - pointer to the device data
4531 
4532    Level: developer
4533 
4534    Note:
4535    Does not trigger host-device copies and flags data validity on the GPU
4536 
4537 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4538 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: unlike GetArray/GetArrayRead, no MatSeqAIJCUSPARSECopyToGPU() here —
     existing values will be overwritten, so no host->device copy is needed */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU becomes the valid copy */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
4557 
4558 /*@C
4559    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4560 
4561    Not Collective
4562 
4563    Input Parameter:
4564 .   A - a `MATSEQAIJCUSPARSE` matrix
4565 
4566    Output Parameter:
4567 .   a - pointer to the device data
4568 
4569    Level: developer
4570 
4571 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4572 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));             /* values were (re)written; cached diagonal is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));   /* bump state so dependents notice the modification */
  *a = NULL;
  PetscFunctionReturn(0);
}
4584 
/* Lexicographic (row, col) ordering on 4-tuples (row, col, value, extra); only the first
   two components participate in the comparison */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
4593 
/* Unary functor adding a fixed offset to a column index (used to place B's columns after A's) */
struct Shift {
  int _shift; /* the constant offset applied by operator() */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4600 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
/*
   C = [A B] row-wise: C has the rows of A followed (column-wise) by the rows of B,
   i.e. C(i,:) = [A(i,:) B(i,:)]. A and B must have the same number of rows and be
   MATSEQAIJCUSPARSE in CSR format (ELL/HYB not supported). MAT_INPLACE_MATRIX is
   not supported. With MAT_INITIAL_MATRIX the CSR structure of C is built on the
   GPU (merging the COO forms of A and B per row) and a permutation (cooPerm) is
   stored so that MAT_REUSE_MATRIX only needs to scatter new values of A and B
   into C. Host-side i/j/ilen/imax arrays of C are also filled so C is a valid
   SeqAIJ matrix; values live on the GPU (offloadmask set to PETSC_OFFLOAD_GPU).
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  /* only the CSR storage format is handled below */
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Build C from scratch: allocate the cuSPARSE mult structure and CSR storage */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by SpMV calls elsewhere */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records, for each entry of A then B, its slot in C's value array;
       it is what makes the MAT_REUSE_MATRIX path a pure value scatter */
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets of A and B into explicit per-entry row indices (COO) */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag each entry with its origin (1 = from A, 0 = from B) so they can be
         separated again after the merge below */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* shift B's column indices by A->cmap->n on the fly, since B occupies the
         right-hand column block of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      /* zip (row, col, value, origin-tag) tuples and merge A and B entries sorted by (row, col) */
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift of B's column indices applied before the merge */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* partition the positions 0..nz-1 by origin tag: positions of A entries go to
         cooPerm[0..Annz), positions of B entries to cooPerm[Annz..nz) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress merged COO row indices back to CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked by rows: A' contributes rows 0..A->cmap->n-1 of C',
           B' the remaining rows, so the transpose CSR is a simple concatenation */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* back up one slot: A's closing offset coincides with B's opening offset */
          thrust::advance(rT, -1);
        }
        if (BT) {
          /* B's transpose row offsets are shifted by a->nz since its entries follow A's */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's CSR structure (i, j) on the host so C is a complete SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* derive per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    /* host value array is allocated but not filled; values stay on the GPU */
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is unchanged, only scatter new values through cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through cooPerm[0..Annz), then B's through cooPerm[Annz..nz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* transpose values are just A's followed by B's (see initial-matrix path) */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4882 
/*
   Gathers selected entries of A's (GPU-resident) value array into v.

   n   - number of entries to copy
   idx - indices into A's value array to gather; if NULL, the first n entries
         are copied contiguously
   v   - destination buffer; may be host or device memory (detected via
         isCudaMem), results end up wherever v lives

   When idx is given, the gather runs on the GPU with a permutation iterator;
   for a host destination it goes through a temporary device buffer and a
   single device-to-host copy.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set, then gather av[idx[i]] -> dv[i] on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: straight copy of the leading n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination means data just moved device->host; log that direction
     (was incorrectly logged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}
4918