xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 06024a9c98dfd455870bf19bbf2808128d8ceaa8)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17 // thrust::for_each(thrust::cuda::par.on()) requires C++14
18 #endif
19 #include <thrust/iterator/constant_iterator.h>
20 #include <thrust/remove.h>
21 #include <thrust/sort.h>
22 #include <thrust/unique.h>
23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24   #include <cuda/std/functional>
25 #endif
26 
27 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
30     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
31 
32   typedef enum {
33       CUSPARSE_MV_ALG_DEFAULT = 0,
34       CUSPARSE_COOMV_ALG      = 1,
35       CUSPARSE_CSRMV_ALG1     = 2,
36       CUSPARSE_CSRMV_ALG2     = 3
37   } cusparseSpMVAlg_t;
38 
39   typedef enum {
40       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
41       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
42       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
43       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
44       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
45       CUSPARSE_SPMM_ALG_DEFAULT = 0,
46       CUSPARSE_SPMM_COO_ALG1    = 1,
47       CUSPARSE_SPMM_COO_ALG2    = 2,
48       CUSPARSE_SPMM_COO_ALG3    = 3,
49       CUSPARSE_SPMM_COO_ALG4    = 5,
50       CUSPARSE_SPMM_CSR_ALG1    = 4,
51       CUSPARSE_SPMM_CSR_ALG2    = 6,
52   } cusparseSpMMAlg_t;
53 
54   typedef enum {
55       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
56       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
57   } cusparseCsr2CscAlg_t;
58   */
59 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
60 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
61 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
62 #endif
63 
64 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
66 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
67 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
68 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
71 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
73 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
74 #endif
75 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
76 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
77 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
78 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
85 
86 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
88 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
89 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
90 
91 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
92 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
93 
94 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
95 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
96 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
97 
/* Type-specific implementation behind MatCUSPARSESetFormat(): stores the requested
   GPU storage format in the matrix's Mat_SeqAIJCUSPARSE context. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  /* A sequential matrix keeps a single storage format, so MULT and ALL set the same field */
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
115 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation; silently a no-op for non-CUSPARSE matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
139 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records whether the
   triangular solves should be performed on the CPU instead of the GPU. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu; /* flag consulted later when choosing the MatSolve path */
  PetscFunctionReturn(PETSC_SUCCESS);
}
148 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation; silently a no-op for non-CUSPARSE matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
172 
/* MatSetOption implementation for SEQAIJCUSPARSE: intercepts MAT_FORM_EXPLICIT_TRANSPOSE
   and forwards every other option to the host SeqAIJ implementation. */
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* Discard any cached transpose when the option is switched off, so a stale copy
       cannot be reused if the option is later re-enabled */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
188 
/* Process -mat_cusparse_* command-line options for a SEQAIJCUSPARSE matrix:
   storage format (CSR/ELL/HYB), CPU-vs-GPU solve, and cuSPARSE algorithm choices
   for SpMV, SpMM and CSR->CSC conversion. Options apply only to unfactored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { /* these options are meaningless for factored matrices */
    /* format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* format for all supported operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    /* The literal values (2, 4, 1 below) pin the expected positions; a mismatch means cuSPARSE renumbered its enums */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
224 
225 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Build (or refresh) the device-side combined L/U factor for the modern (CUDA >= 11.4)
   SpSV-based solve path. The SeqAIJ factored storage keeps L (by rows, without the unit
   diagonal) in a->i/a->j/a->a and U (by rows, right-to-left via a->diag) separately;
   this routine merges them into one regular CSR matrix M = L + U on the device and sets
   up the cusparseSpSV descriptors/buffers needed by MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Isn't it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      // Build the symbolic (column-index) part of M once; row i of M is [L_i | diag | U_i]
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];     // number of strictly-lower entries in row i
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // diagonal entry plus strictly-upper entries in row i
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // Both L and U descriptors share the same CSR arrays of M; the FILL_MODE/DIAG_TYPE
      // attributes tell cuSPARSE which triangle (and which diagonal convention) to use
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi; // keep the host row pointers and value buffer for later numeric refreshes
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj)); // column indices never change, so the host copy is no longer needed
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry; SeqAIJ stores its reciprocal
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; // transpose analysis (if any) is now stale
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
330 #else
/* Legacy (CUDA < 11.4) path: build the lower-triangular factor L (unit diagonal added
   explicitly) on the host, upload it into a CsrMatrix on the device, and run the
   csrsv solve analysis. On subsequent calls with the same structure, only the
   numerical values are refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); // nothing to build for an empty matrix
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first call: build structure + values and the solve analysis */
        PetscScalar *AALo;

        // Host staging buffers use pinned memory for faster host-to-device transfers
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0; // row 0 of L has only the unit diagonal
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          // append the explicit unit diagonal at the end of the row
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        // assign() copies the staged host arrays into device-side thrust vectors
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo; // kept for value-only refreshes below
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; // re-insert the explicit unit diagonal
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
461 
462 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
463 {
464   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
465   PetscInt                           n                  = A->rmap->n;
466   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
467   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
468   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
469   const MatScalar                   *aa = a->a, *v;
470   PetscInt                          *AiUp, *AjUp;
471   PetscInt                           i, nz, nzUpper, offset;
472 
473   PetscFunctionBegin;
474   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
475   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
476     try {
477       /* next, figure out the number of nonzeros in the upper triangular matrix. */
478       nzUpper = adiag[0] - adiag[n];
479       if (!upTriFactor) {
480         PetscScalar *AAUp;
481 
482         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
483 
484         /* Allocate Space for the upper triangular matrix */
485         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
486         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
487 
488         /* Fill the upper triangular matrix */
489         AiUp[0] = (PetscInt)0;
490         AiUp[n] = nzUpper;
491         offset  = nzUpper;
492         for (i = n - 1; i >= 0; i--) {
493           v  = aa + adiag[i + 1] + 1;
494           vi = aj + adiag[i + 1] + 1;
495 
496           /* number of elements NOT on the diagonal */
497           nz = adiag[i] - adiag[i + 1] - 1;
498 
499           /* decrement the offset */
500           offset -= (nz + 1);
501 
502           /* first, set the diagonal elements */
503           AjUp[offset] = (PetscInt)i;
504           AAUp[offset] = (MatScalar)1. / v[nz];
505           AiUp[i]      = AiUp[i + 1] - (nz + 1);
506 
507           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
508           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
509         }
510 
511         /* allocate space for the triangular factor information */
512         PetscCall(PetscNew(&upTriFactor));
513         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
514 
515         /* Create the matrix description */
516         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
517         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
518   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
519         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
520   #else
521         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
522   #endif
523         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
524         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
525 
526         /* set the operation */
527         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
528 
529         /* set the matrix */
530         upTriFactor->csrMat              = new CsrMatrix;
531         upTriFactor->csrMat->num_rows    = n;
532         upTriFactor->csrMat->num_cols    = n;
533         upTriFactor->csrMat->num_entries = nzUpper;
534 
535         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
536         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
537 
538         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
539         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
540 
541         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
542         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
543 
544         /* Create the solve analysis information */
545         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
546         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
547   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
548         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
549                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
550         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
551   #endif
552 
553         /* perform the solve analysis */
554         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
555                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
556 
557         PetscCallCUDA(WaitForCUDA());
558         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
559 
560         /* assign the pointer */
561         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
562         upTriFactor->AA_h                                          = AAUp;
563         PetscCallCUDA(cudaFreeHost(AiUp));
564         PetscCallCUDA(cudaFreeHost(AjUp));
565         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
566       } else {
567         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
568         /* Fill the upper triangular matrix */
569         offset = nzUpper;
570         for (i = n - 1; i >= 0; i--) {
571           v = aa + adiag[i + 1] + 1;
572 
573           /* number of elements NOT on the diagonal */
574           nz = adiag[i] - adiag[i + 1] - 1;
575 
576           /* decrement the offset */
577           offset -= (nz + 1);
578 
579           /* first, set the diagonal elements */
580           upTriFactor->AA_h[offset] = 1. / v[nz];
581           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
582         }
583         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
584         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
585       }
586     } catch (char *ex) {
587       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
588     }
589   }
590   PetscFunctionReturn(PETSC_SUCCESS);
591 }
592 #endif
593 
// Mirror the ILU factors of A onto the GPU and, when the row/column orderings used by the
// factorization are not the identity, upload those permutation indices as well (each only once).
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS = aij->row, icolIS = aij->icol;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n); // scratch used by the legacy solve path
#endif

  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* upload the row permutation indices if the row ordering is nontrivial (only on first use) */
  PetscCall(ISIdentity(rowIS, &rowIdentity));
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowIS, &ridx));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ridx, ridx + n);
    PetscCall(ISRestoreIndices(rowIS, &ridx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upload the inverse column permutation indices if that ordering is nontrivial (only on first use) */
  PetscCall(ISIdentity(icolIS, &colIdentity));
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(icolIS, &cidx));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(cidx, cidx + n);
    PetscCall(ISRestoreIndices(icolIS, &cidx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
640 
641 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (first call) or refresh (later calls) the GPU copy of the host Cholesky/ICC factor as a
// single CSR matrix U (unit diagonal) plus a separate diagonal vector D, and set up the cuSPARSE
// SpSV descriptors/buffers used by MatSolve_SeqAIJCUSPARSE_Cholesky() to solve Ut D U x = b.
// Structural setup runs once (guarded by fs->csrRowPtr); numeric values are re-uploaded every time
// A's factors are newer on the CPU.
// NOTE(review): the name carries a historical typo ("Cheolesky"); the caller below uses the same
// spelling, so a rename must touch both sites at once.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // First time doing the setup? Use csrRowPtr as the flag since it is non-null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; only the structure (row pointers and column indices) is
      // uploaded here — the values (csrVal, diag) are uploaded below on every numeric update
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record the host staging arrays for reuse by later numeric updates; only Mj is temporary
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values: rebuild the host staging arrays from A's current factor, then upload
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // NOTE(review): off-diagonal entries are negated relative to the host storage — presumably to
      // match the layout documented at MatICCFactorSymbolic_SeqAIJ(); confirm when changing this loop
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    // Cheaper path once an analysis exists: just push the new values into the SpSV descriptors
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
735 
// Solve Ut D U x = b in three stages (Ut solve, diagonal scaling, U solve), with optional
// row/column permutations applied on entry/exit. D holds the already-inverted diagonal,
// so the diagonal stage is a plain elementwise multiply.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *a       = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> bptr;
  thrust::device_ptr<PetscScalar>       xptr;
  const cusparseSpSVAlg_t               spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              n       = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  bptr = thrust::device_pointer_cast(b_d);
  xptr = thrust::device_pointer_cast(x_d);

  // Apply the row permutation to b if present; the (possibly permuted) rhs becomes the SpSV input X
  if (factors->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)b_d));
  }

  // Stage 1: solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut));

  // Stage 2: diag(D) Z = Y reduces to Y = Y .* D since D was inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(factors->Y), thrust::device_pointer_cast(factors->Y + n), thrust::device_pointer_cast(factors->diag), thrust::device_pointer_cast(factors->Y), thrust::multiplies<PetscScalar>()));

  // Stage 3: solve U X = Y; write into the scratch X when a column permutation must still be applied
  if (factors->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, x_d));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  // Undo the column permutation (if any) while copying the solution into x
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + n), factors->cpermIndices->end()), xptr));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * a->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
791 #else
// (Legacy path for CUDA < 11.4.) Mirror the host IC(0)/Cholesky factor onto the GPU as two
// csrsv-style triangular factors sharing the same sparsity pattern: an upper factor solved
// non-transposed (unit diagonal) and a "lower" factor solved with CUSPARSE_OPERATION_TRANSPOSE
// (non-unit diagonal). Structure and analysis are built once; later calls refresh values only.
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  // NOTE(review): A->data is cast to Mat_SeqAIJ above and Mat_SeqSBAIJ here; this relies on the
  // leading i/j/a members of the two structs lining up — confirm before reordering either struct
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays; freed at the end of either branch */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] is the last stored entry of row i */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* off-diagonals: negate for the upper factor; additionally scale by the diagonal for the lower one */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); /* diagonal handled separately via AALo/AAUp values */

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* upper fill mode + transpose solve op below realize the lower-triangular solve */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Factors already exist on the GPU: same sparsity pattern, so only refresh the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
988 #endif
989 
// Upload the IC(0)/Cholesky factor of A to the GPU; when the factorization used a nontrivial
// ordering, also upload the permutation and its inverse for use by the GPU solves.
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            perm = aij->row;
  PetscBool                     identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(n);
#endif
  fs->nnz = (aij->nz - n) * 2 + n; // off-diagonal entries counted for both triangles, diagonal once

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation (rperm) and its inverse (cperm) when the ordering is nontrivial */
  PetscCall(ISIdentity(perm, &identity));
  if (!identity) {
    IS              inv;
    const PetscInt *invIdx, *idx;

    PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &inv));
    PetscCall(ISGetIndices(inv, &invIdx));
    PetscCall(ISGetIndices(perm, &idx));
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(idx, idx + n);
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(invIdx, invIdx + n);
    PetscCall(ISRestoreIndices(inv, &invIdx));
    PetscCall(ISDestroy(&inv));
    PetscCall(ISRestoreIndices(perm, &idx));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1031 
// Numeric Cholesky for SeqAIJCUSPARSE: pull A's values to the host, run the CPU numeric
// factorization, install the CUDA solve callbacks, then mirror the factor back onto the GPU.
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; // the fresh factor lives on the host until copied below

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* pick the MatSolve flavor: the natural-ordering variants skip the permutation steps */
    Mat_SeqAIJ *bb = (Mat_SeqAIJ *)B->data;
    PetscBool   identity;

    PetscCall(ISIdentity(bb->row, &identity));
    if (identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1064 
1065 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1066 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1067 {
1068   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1069   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1070   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1071   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1072   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1073   cusparseIndexBase_t                indexBase;
1074   cusparseMatrixType_t               matrixType;
1075   cusparseFillMode_t                 fillMode;
1076   cusparseDiagType_t                 diagType;
1077 
1078   PetscFunctionBegin;
1079   /* allocate space for the transpose of the lower triangular factor */
1080   PetscCall(PetscNew(&loTriFactorT));
1081   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1082 
1083   /* set the matrix descriptors of the lower triangular factor */
1084   matrixType = cusparseGetMatType(loTriFactor->descr);
1085   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1086   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1087   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1088 
1089   /* Create the matrix description */
1090   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1091   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1092   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1093   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1094   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1095 
1096   /* set the operation */
1097   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1098 
1099   /* allocate GPU space for the CSC of the lower triangular factor*/
1100   loTriFactorT->csrMat                 = new CsrMatrix;
1101   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1102   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1103   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1104   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1105   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1106   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1107 
1108   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1109   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1110   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1111                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1112                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1113   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1114   #endif
1115 
1116   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1117   {
1118     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1119     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1120                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1121   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1122                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1123   #else
1124                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1125   #endif
1126     PetscCallCUSPARSE(stat);
1127   }
1128 
1129   PetscCallCUDA(WaitForCUDA());
1130   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1131 
1132   /* Create the solve analysis information */
1133   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1134   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1135   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1136   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1137                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1138   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1139   #endif
1140 
1141   /* perform the solve analysis */
1142   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1143                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1144 
1145   PetscCallCUDA(WaitForCUDA());
1146   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1147 
1148   /* assign the pointer */
1149   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1150 
1151   /*********************************************/
1152   /* Now the Transpose of the Upper Tri Factor */
1153   /*********************************************/
1154 
1155   /* allocate space for the transpose of the upper triangular factor */
1156   PetscCall(PetscNew(&upTriFactorT));
1157   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1158 
1159   /* set the matrix descriptors of the upper triangular factor */
1160   matrixType = cusparseGetMatType(upTriFactor->descr);
1161   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1162   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1163   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1164 
1165   /* Create the matrix description */
1166   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1167   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1168   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1169   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1170   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1171 
1172   /* set the operation */
1173   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1174 
1175   /* allocate GPU space for the CSC of the upper triangular factor*/
1176   upTriFactorT->csrMat                 = new CsrMatrix;
1177   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1178   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1179   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1180   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1181   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1182   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1183 
1184   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1185   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1186   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1187                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1188                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1189   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1190   #endif
1191 
1192   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1193   {
1194     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1195     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1196                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1197   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1198                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1199   #else
1200                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1201   #endif
1202     PetscCallCUSPARSE(stat);
1203   }
1204 
1205   PetscCallCUDA(WaitForCUDA());
1206   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1207 
1208   /* Create the solve analysis information */
1209   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1210   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1211   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1212   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1213                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1214   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1215   #endif
1216 
1217   /* perform the solve analysis */
  /* TODO: the lower- and upper-factor transpose/analysis setup above and below is duplicated; it should be factored into a shared helper function */
1219   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1220                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1221 
1222   PetscCallCUDA(WaitForCUDA());
1223   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1224 
1225   /* assign the pointer */
1226   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1227   PetscFunctionReturn(PETSC_SUCCESS);
1228 }
1229 #endif
1230 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform() below to recover integer permutation indices that
   were carried through a cusparse conversion inside a scalar-valued array. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1234 
/*
   Builds (or refreshes) an explicit transpose of A on the GPU, stored in
   cusparsestruct->matTranspose, so transpose products can run as plain SpMV on A^T.

   Input Parameter:
.  A - the MATSEQAIJCUSPARSE matrix

   Notes:
   - Returns early when A->transupdated indicates the cached transpose is current.
   - For MAT_CUSPARSE_CSR the transpose structure is created once; on later calls only
     the numerical values are refreshed by gathering A's values through the cached
     csr2csc_i permutation (built below with a csr2csc run on an index sequence).
   - ELL/HYB formats are handled only for CUDA < 11 (dropped by cusparse afterwards).
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  /* if the transpose is flagged up to date, it must actually exist */
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR transposes cannot be value-updated in place; throw away any cached one and rebuild */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants used by the SpMV calls */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose swaps row/column dimensions; nnz is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* cache a 32-bit device copy of A's host row offsets a->i for use by csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* legacy (CUDA < 11) path: HYB -> CSR -> CSC -> HYB round trip through two temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT uses A's (rows, cols) rather than the swapped transpose
         dimensions; this is only consistent for square matrices -- confirm intent */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value permutation once: run csr2csc on the sequence
         0,1,2,... stored as scalars; the permuted output, cast back to integers,
         is exactly the index map from A's values to the transpose's values. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      /* scalar-valued iota (0.0 start) because csr2csc permutes a scalar array */
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* extract the integer permutation from the permuted scalar sequence */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* refresh the transpose's numerical values by gathering A's values through csr2csc_i */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1427 
1428 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1429 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1430 {
1431   const PetscScalar                    *barray;
1432   PetscScalar                          *xarray;
1433   thrust::device_ptr<const PetscScalar> bGPU;
1434   thrust::device_ptr<PetscScalar>       xGPU;
1435   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1436   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1437   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1438   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1439   PetscInt                              m   = A->rmap->n;
1440 
1441   PetscFunctionBegin;
1442   PetscCall(PetscLogGpuTimeBegin());
1443   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1444   PetscCall(VecCUDAGetArrayRead(b, &barray));
1445   xGPU = thrust::device_pointer_cast(xarray);
1446   bGPU = thrust::device_pointer_cast(barray);
1447 
1448   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1449   if (fs->rpermIndices) {
1450     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1451     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1452   } else {
1453     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1454   }
1455 
1456   // Solve L Y = X
1457   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1458   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1459   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1460 
1461   // Solve U X = Y
1462   if (fs->cpermIndices) {
1463     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1464   } else {
1465     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1466   }
1467   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1468 
1469   // Reorder X with the column permutation if needed, and put the result back to x
1470   if (fs->cpermIndices) {
1471     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1472                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1473   }
1474   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1475   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1476   PetscCall(PetscLogGpuTimeEnd());
1477   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1478   PetscFunctionReturn(PETSC_SUCCESS);
1479 }
1480 
/*
   Solve A^T x = b using the existing LU factors of A: the code solves with U (transpose
   mode) first and then with L (transpose mode), running cusparseSpSV with
   CUSPARSE_OPERATION_TRANSPOSE on the untransposed L/U matrix descriptors instead of
   forming L^T/U^T explicitly. The transpose-specific SpSV descriptors, buffers and
   analysis are created lazily on the first call and reused afterwards.

   Input Parameters:
.  A - the factored matrix (Mat_SeqAIJCUSPARSETriFactors in A->spptr)
.  b - right-hand side vector

   Output Parameter:
.  x - solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    /* create the transpose-solve descriptors and size/allocate their external buffers */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    /* run the transpose-mode symbolic analysis for both factors
       (flag presumably cleared elsewhere when factor values change -- not visible here) */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* the two triangular solves together perform 2*nz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1551 #else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b with the (lazily built) transposed triangular factors.
   Steps: apply the row permutation to b, solve with the transposed U factor, then the transposed L
   factor, and finally apply the column permutation to obtain x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n; /* local length of the solution vector */
  const PetscScalar                    *barray;         /* raw device pointer to b */
  PetscScalar                          *xarray;         /* raw device pointer to x */
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly.  This happens only on the first
     transpose solve after (re)factorization; the result is cached in the TriFactors struct. */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U (the transposed upper factor; result goes into the work vector) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L (the transposed lower factor; result back into xarray) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1604 
/* Solve A^T x = b when the factorization was computed in natural ordering, so no row/column
   permutations are applied: solve with the transposed upper factor, then the transposed lower one.
   The transposed factor structures are created lazily on the first call and cached. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;

  PetscFunctionBegin;
  /* Build the transposed-factor analysis on demand the first time we get here */
  if (!upT && !loT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Raw device pointers for cusparse */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* Solve with the transposed U factor: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, barray, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Solve with the transposed L factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), xarray, loT->solvePolicy, loT->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1642 
/* Solve A x = b with the triangular factors: apply the row permutation to b, forward solve with L,
   backward solve with U, then apply the column permutation to produce x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray; /* raw device pointer to b */
  PetscScalar                          *xarray; /* raw device pointer to x */
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU = b(rperm).  The range length is given by the
     rpermIndices iterators; the base pointer bGPU is the same at both ends. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: tempGPU -> xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: xarray -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x = tempGPU(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1682 
/* Solve A x = b for a factorization computed in natural ordering (identity permutations):
   forward solve with L into the work vector, then backward solve with U into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;

  PetscFunctionBegin;
  /* Raw device pointers for cusparse */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* Forward solve with L: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, barray, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Backward solve with U: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), xarray, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1712 #endif
1713 
1714 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric phase of ILU(0): copy A's values into the factor's CSR arrays (the structure was set up by
   the symbolic phase), factor in place with cusparseXcsrilu02, and make sure the SpSV solve
   descriptors for L and U are up to date with the new values. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsrilu02_zeroPivot() is blocking: it synchronizes before reporting the pivot status */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* On new enough CUDA, a prior SpSV analysis can be refreshed cheaply with the new values */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1777 
/* Symbolic phase of ILU(0) on the GPU.

   ILU(0) keeps A's sparsity pattern, so no fill computation is needed.  This routine copies A's CSR
   structure into fact, creates the cusparse descriptors for M (the in-place factor), L and U, queries
   and allocates the csrilu02/SpSV work buffers, runs the (structural) csrilu02 analysis, and records
   a FLOP estimate used by the numeric phase for logging.  The two IS arguments (row/column orderings)
   are ignored: ILU(0) factors in the natural ordering.

   Fix: the FLOP estimate previously overwrote nzLeft (the exact number of nonzeros left of the
   diagonal, Adiag[i] - Ai[i]) with the symmetric-pattern guess (nzRow - 1) / 2 that belongs to the
   ICC(0) path, making the guarded computation a dead store and the estimate wrong for rows with an
   unsymmetric nonzero split. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* zero fill: the factor has exactly A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L from ILU has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i]; /* exact count of sub-diagonal nonzeros in row i; do NOT overwrite with the symmetric (nzRow-1)/2 guess used by ICC(0) */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1934 
/* Triangular solves with the IC(0) factor: forward solve with L, then a solve with L applied
   transposed (spsvDescr_Lt was analyzed with CUSPARSE_OPERATION_TRANSPOSE). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Bind b to descriptor X and the internal work array to descriptor Y, then solve L Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* L Y = X */
                                       factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Rebind descriptor X to x, then solve Lt X = Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* Lt X = Y */
                                       factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1965 
/* Numeric phase of IC(0): copy A's values into the factor's CSR arrays, run the in-place Cholesky
   factorization with cusparseXcsric02, and refresh (or redo) the SpSV analyses for L and L^T. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* csric02 errors out with m=0, hence the guard */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsric02_zeroPivot() is blocking: it synchronizes before reporting the pivot status */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* On new enough CUDA, a prior SpSV analysis can be refreshed cheaply with the new values */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, so it must run after csric02() has produced the values */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* same routine: the IC(0) factor solve is its own transpose solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2027 
/* Symbolic phase of zero-fill incomplete Cholesky (ICC(0)) on device via cusparseXcsric02.

   Since ICC(0) introduces no fill, fact reuses A's sparsity pattern: A's i/j arrays are
   copied (on device) into fact, descriptors and work buffers for the factorization and the
   two triangular solves (L and L^T) are allocated, and the csric02 analysis is performed.
   On success, fact->ops->choleskyfactornumeric is set to the matching ICC0 numeric routine.

   Input Parameters:
+  fact - the factor matrix (MATSEQAIJCUSPARSE); receives pattern, buffers and descriptors
.  A    - the matrix to factor; must be square MATSEQAIJCUSPARSE with a full diagonal
-  info - factorization options (only consulted by the caller; levels==0 assumed here)

   NOTE(review): the unnamed IS argument (the permutation) is ignored — this fast path is
   only selected for identity orderings; verify against MatICCFactorSymbolic_SeqAIJCUSPARSE. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  /* Debug-only sanity checks: right type, square, no missing diagonal entries
     (csric02 requires the diagonal to be structurally present) */
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  /* device-to-device copies of the pattern; queued on PETSc's default stream */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares the CSR arrays of M (in-place); only its lower triangle attributes differ */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  /* X/Y are scratch dense vectors used by the SpSV descriptors below */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    /* share factBuffer_M with the larger solve buffer (L); Lt gets its own */
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* assumes roughly symmetric pattern: half of the off-diagonals lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2169 #endif
2170 
/* Numeric LU factorization: compute the factor on the host with the SeqAIJ kernel,
   then (unless the user asked for CPU solves) install the GPU solve routines and
   upload the triangular factors.

   Input Parameters:
+  B    - the factor matrix (holds Mat_SeqAIJCUSPARSETriFactors in spptr)
.  A    - the matrix being factored (holds Mat_SeqAIJCUSPARSE in spptr)
-  info - factorization options, forwarded to MatLUFactorNumeric_SeqAIJ() */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve lives in A's Mat_SeqAIJCUSPARSE; B, being factored, carries Mat_SeqAIJCUSPARSETriFactors instead.
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscBool           gpu_solve;

  PetscFunctionBegin;
  /* The factorization itself runs on the host, so sync A's values down first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factor values currently exist on the host only */

  gpu_solve = cusp->use_cpu_solve ? PETSC_FALSE : PETSC_TRUE;
  if (gpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(b->row, &row_identity));
    PetscCall(ISIdentity(b->col, &col_identity));
    if (row_identity && col_identity) {
      /* identity orderings allow the cheaper natural-ordering solvers */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the device for the GPU solvers */
  if (gpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2209 
/* Symbolic LU factorization: discard any stale device-side factor data, run the
   host SeqAIJ symbolic phase, and point the numeric phase at the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); /* wipe leftovers from a previous factorization */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2220 
/* Symbolic ILU factorization dispatcher.

   With CUDA >= 11.4, ILU(0) with identity row/column orderings (and factorization not
   forced onto the host) takes the device fast path MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0();
   otherwise the host SeqAIJ symbolic routine is used and the CUSPARSE numeric routine installed.

   NOTE: the `} else` before the #endif deliberately pairs with the unconditional block
   below it, so the fallback compiles with or without the CUDA >= 11.4 branch. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) { /* only probe orderings when device factorization is allowed */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* host fallback: reset stale device factors and use the SeqAIJ symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2243 
/* Symbolic ICC factorization dispatcher.

   With CUDA >= 11.4, ICC(0) with an identity permutation (and factorization not forced
   onto the host) takes the device fast path MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0();
   otherwise the host SeqAIJ symbolic routine runs and the CUSPARSE numeric routine is installed.

   NOTE: the `} else` before the #endif pairs with the unconditional block below it,
   mirroring MatILUFactorSymbolic_SeqAIJCUSPARSE(). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); /* only probe when device factorization is allowed */
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* host fallback: reset stale device factors and use the SeqAIJ symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2263 
/* Symbolic Cholesky factorization: clear stale device factor data, run the host
   SeqAIJ symbolic phase, and route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); /* wipe leftovers from a previous factorization */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2274 
/* Report the solver package that produced this factor: always MATSOLVERCUSPARSE.
   Composed on factor matrices under the "MatFactorGetSolverType_C" key. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2281 
2282 /*MC
2283   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2284   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
2285   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
2286   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2287   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2288   algorithms are not recommended. This class does NOT support direct solver operations.
2289 
2290   Level: beginner
2291 
2292 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2293           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2294 M*/
2295 
/* Create an (empty) factor matrix of type MATSEQAIJCUSPARSE for A, wiring up the symbolic
   factorization routines appropriate to the requested factor type and to whether A is
   bound to the CPU, and registering preferred orderings and the solver-type query.

   Input Parameters:
+  A     - the matrix to be factored (MATSEQAIJCUSPARSE)
-  ftype - MAT_FACTOR_LU/ILU/ILUDT or MAT_FACTOR_CHOLESKY/ICC

   Output Parameter:
.  B - the new factor matrix */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt  n     = A->rmap->n;
  const PetscBool oncpu = A->boundtocpu; /* CPU-bound matrices get the plain SeqAIJ symbolic routines */
  Mat             F;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  F = *B;
  PetscCall(MatSetSizes(F, n, n, n, n));
  F->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(F, MATSEQAIJCUSPARSE));

  if (oncpu && A->bindingpropagates) PetscCall(MatBindToCPU(F, PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(F, A, A));
    F->ops->ilufactorsymbolic = oncpu ? MatILUFactorSymbolic_SeqAIJ : MatILUFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->lufactorsymbolic  = oncpu ? MatLUFactorSymbolic_SeqAIJ : MatLUFactorSymbolic_SeqAIJCUSPARSE;
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    F->ops->iccfactorsymbolic      = oncpu ? MatICCFactorSymbolic_SeqAIJ : MatICCFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->choleskyfactorsymbolic = oncpu ? MatCholeskyFactorSymbolic_SeqAIJ : MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(F, MAT_SKIP_ALLOCATION, NULL));
  F->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)F, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2336 
/* Synchronize the host copy of the matrix values with the device copy.

   A no-op unless the values live only on the GPU (PETSC_OFFLOAD_GPU). Only values are
   copied — the sparsity pattern is assumed identical on both sides. Works for unfactored
   matrices (values in the Mat_SeqAIJCUSPARSE CsrMatrix) and, with CUDA >= 11.4, for
   factored matrices whose values sit in Mat_SeqAIJCUSPARSETriFactors::csrVal.
   On success the offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* same spptr, reinterpreted: factored matrices store Mat_SeqAIJCUSPARSETriFactors there */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2366 
/* Hand out a read/write pointer to the host value array, syncing from the GPU first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host values must be current before access */
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2374 
/* Finish read/write host access: the host copy may have changed, so mark the device stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now out of date */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2382 
/* Hand out a read-only pointer to the host value array, syncing from the GPU first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host values must be current before access */
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2390 
/* Finish read-only host access: nothing was modified, so the offload mask is untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2397 
/* Hand out a write-only pointer to the host value array; existing values will be
   overwritten, so no device-to-host sync is performed. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2404 
/* Finish write-only host access: host values changed, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now out of date */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2412 
/* Expose the device CSR arrays (row offsets, column indices, values) of an unfactored
   SeqAIJCUSPARSE matrix, along with the memory type (CUDA).

   Output Parameters (each may be NULL if not wanted):
+  i     - device pointer to row offsets
.  j     - device pointer to column indices
.  a     - device pointer to values
-  mtype - set to PETSC_MEMTYPE_CUDA

   Notes: the CsrMatrix stores 32-bit indices (THRUSTINTARRAY32), so i/j cannot be
   returned as PetscInt* when PETSc is configured with 64-bit indices.

   Fixed: the two error messages read "cuSparse does not supported 64-bit indices";
   corrected the grammar/spelling to "cuSPARSE does not support 64-bit indices". */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2443 
/* Synchronize the device copy of the matrix with the host copy.

   A no-op unless the host copy is newer (offload mask UNALLOCATED or CPU). If the
   nonzero pattern is unchanged since the last upload and the format is CSR, only the
   values are re-uploaded; otherwise the entire device representation (CSR or, for
   CUDA < 11, ELL/HYB) is destroyed and rebuilt from the host arrays, honoring
   compressed-row storage if in use. On success the offload mask becomes
   PETSC_OFFLOAD_BOTH (unless there were no host values to copy). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* whether host AND device hold valid values afterwards */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached explicit transpose is stale */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* pattern changed (or non-CSR format): rebuild the device matrix from scratch */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row structure: compressed (only nonempty rows) or full */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload pattern only; device won't mirror host values */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1) for SpMV calls in device pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR matrix, convert it to HYB, then discard the CSR copy */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2593 
/* thrust functor for zip iterators: accumulate element 0 into element 1 (y += x) */
struct VecCUDAPlusEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2601 
/* thrust functor for zip iterators: copy element 0 into element 1 (y = x) */
struct VecCUDAEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2609 
/* thrust functor for zip iterators: copy element 1 into element 0 (x = y) */
struct VecCUDAEqualsReverse {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2617 
/* Per-product state cached on C->product->data for sparse mat-mat operations,
   freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* whether the product matrix C is dense */
  PetscScalar   *Bt;       /* device buffer for an explicit transpose of B, if needed */
  Mat            X;        /* intermediate dense matrix (e.g. for PtAP/RARt) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count recorded for logging */
  CsrMatrix     *Bcsr;     /* CSR copy of B when cuSPARSE needs one */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* extra SpGEMM work buffers introduced by the CUDA 11.4 SpGEMM_reuse API */
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2642 
/* Destructor for MatMatCusparse product data: releases every device buffer and
   cuSPARSE descriptor that was allocated, then the struct itself. Safe to call
   with fields left NULL/unallocated (each is guarded). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2666 
2667 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2668 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a sparse (SeqAIJCUSPARSE) times
  dense product prepared by MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA().

  Supports MATPRODUCT_AB, AtB, ABt, PtAP and RARt. For PtAP/RARt the sparse-dense multiply is
  written into the intermediate dense matrix mmdata->X and a final dense-dense multiply with B
  produces C. With CUDA >= 11 the multiply is done with cusparseSpMM() (which supports op(B));
  the legacy path uses cusparse_csr_spmm(), which cannot transpose B, so B^T is formed explicitly
  with cublasXgeam() into mmdata->Bt first.

  Note: if B (or C) was passed as a host MATSEQDENSE it is converted in place to MATSEQDENSECUDA
  for the computation and converted back before returning.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda; /* op(A)*op(B) is m x n; blda/clda are leading dimensions of B and of C (or X) */
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select the CSR multiply struct, op(A), and the result dimensions for the sparse-dense stage */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*B here; C is finished below from B and X */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* use the stored explicit transpose of A so the multiply itself is non-transposed */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*B^T here; C is finished below from B and X */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* the sparse-dense multiply writes into the intermediate X rather than into C */
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; /* one SpMM descriptor per op(A) */
  #else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
  #endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) { /* stale descriptor for B: drop it and recreate with the new LDA */
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) { /* stale descriptor for C/X: drop it and recreate with the new LDA */
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
  #endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* only grow the work buffer, never shrink it */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
  #endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (allocated in the symbolic phase) */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); /* 2 flops per stored entry per column of the dense operand */
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); /* C = B * X, with X = A*B^T computed above */
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); /* C = B^T * X, with X = A*B computed above */
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* restore the types the user passed in, if they were host matrices */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2844 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for products of a
  SeqAIJCUSPARSE matrix A with a dense matrix B.

  Determines the dimensions and block sizes of C from the product type, turns C into a
  MATSEQDENSECUDA matrix (remembering whether the caller asked for a host MATSEQDENSE so the
  numeric phase can convert the result back), allocates the MatMatCusparse product data, and
  installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine. PtAP and RARt
  additionally create an intermediate dense matrix X for the sparse-dense partial product.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nrows, ncols;
  PetscBool           wasdense, iscusp;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &iscusp));
  PetscCheck(iscusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* the shape and block sizes of C depend on the requested product */
  if (product->type == MATPRODUCT_AB) {
    nrows = A->rmap->n;
    ncols = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
  } else if (product->type == MATPRODUCT_AtB) {
    nrows = A->cmap->n;
    ncols = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
  } else if (product->type == MATPRODUCT_ABt) {
    nrows = A->rmap->n;
    ncols = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
  } else if (product->type == MATPRODUCT_PtAP) {
    nrows = B->cmap->n;
    ncols = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
  } else if (product->type == MATPRODUCT_RARt) {
    nrows = B->rmap->n;
    ncols = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, nrows, ncols, nrows, ncols));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &wasdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = wasdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* the legacy cusparseXcsrmm cannot transpose B, so reserve device memory for an explicit B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* PtAP and RARt need intermediate storage for the sparse-dense partial product */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    /* no preallocation here: the first call to MatDenseCUDAGetArray allocates on the GPU for us */
    if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mm->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    else PetscCall(MatSetSizes(mm->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2924 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a sparse-sparse product
  C = op(A)*op(B) with A, B, C all SeqAIJCUSPARSE, using the SpGEMM data prepared by the
  symbolic phase (stored in C->product->data as MatMatCusparse).

  With CUDA >= 11.4 the values are recomputed with cusparseSpGEMMreuse_compute(); with older
  CUDA 11.x a full cusparseSpGEMM_compute()/cusparseSpGEMM_copy() pair is used; pre-11.0 falls
  back to the legacy cusparse_csr_spgemm(). Since cuSPARSE SpGEMM does not support transposed
  operands, AtB/ABt use the explicit transposes formed during the symbolic phase.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values are already on the GPU; only the assembly bookkeeping below is needed */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetric operands: A^T = A (B^T = B), so reduce to plain AB as the symbolic phase did */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored matTranspose structs (see opA/opB above) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); /* alpha_one/beta_zero live in device memory */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* recompute only the values; the sparsity pattern was fixed by the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed during the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* fresh values exist only on the device */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3045 
3046 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3047 {
3048   Mat_Product                  *product = C->product;
3049   Mat                           A, B;
3050   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3051   Mat_SeqAIJ                   *a, *b, *c;
3052   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3053   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3054   PetscInt                      i, j, m, n, k;
3055   PetscBool                     flg;
3056   cusparseStatus_t              stat;
3057   MatProductType                ptype;
3058   MatMatCusparse               *mmdata;
3059   PetscLogDouble                flops;
3060   PetscBool                     biscompressed, ciscompressed;
3061 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3062   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3063   cusparseSpMatDescr_t BmatSpDescr;
3064 #else
3065   int cnz;
3066 #endif
3067   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3068 
3069   PetscFunctionBegin;
3070   MatCheckProduct(C, 1);
3071   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3072   A = product->A;
3073   B = product->B;
3074   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3075   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3076   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3077   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3078   a = (Mat_SeqAIJ *)A->data;
3079   b = (Mat_SeqAIJ *)B->data;
3080   /* product data */
3081   PetscCall(PetscNew(&mmdata));
3082   C->product->data    = mmdata;
3083   C->product->destroy = MatDestroy_MatMatCusparse;
3084 
3085   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3086   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3087   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3088   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3089   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3090   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3091 
3092   ptype = product->type;
3093   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3094     ptype                                          = MATPRODUCT_AB;
3095     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3096   }
3097   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3098     ptype                                          = MATPRODUCT_AB;
3099     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3100   }
3101   biscompressed = PETSC_FALSE;
3102   ciscompressed = PETSC_FALSE;
3103   switch (ptype) {
3104   case MATPRODUCT_AB:
3105     m    = A->rmap->n;
3106     n    = B->cmap->n;
3107     k    = A->cmap->n;
3108     Amat = Acusp->mat;
3109     Bmat = Bcusp->mat;
3110     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3111     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3112     break;
3113   case MATPRODUCT_AtB:
3114     m = A->cmap->n;
3115     n = B->cmap->n;
3116     k = A->rmap->n;
3117     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3118     Amat = Acusp->matTranspose;
3119     Bmat = Bcusp->mat;
3120     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3121     break;
3122   case MATPRODUCT_ABt:
3123     m = A->rmap->n;
3124     n = B->rmap->n;
3125     k = A->cmap->n;
3126     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3127     Amat = Acusp->mat;
3128     Bmat = Bcusp->matTranspose;
3129     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3130     break;
3131   default:
3132     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3133   }
3134 
3135   /* create cusparse matrix */
3136   PetscCall(MatSetSizes(C, m, n, m, n));
3137   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3138   c     = (Mat_SeqAIJ *)C->data;
3139   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3140   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3141   Ccsr  = new CsrMatrix;
3142 
3143   c->compressedrow.use = ciscompressed;
3144   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3145     c->compressedrow.nrows = a->compressedrow.nrows;
3146     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3147     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3148     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3149     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3150     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3151   } else {
3152     c->compressedrow.nrows  = 0;
3153     c->compressedrow.i      = NULL;
3154     c->compressedrow.rindex = NULL;
3155     Ccusp->workVector       = NULL;
3156     Cmat->cprowIndices      = NULL;
3157   }
3158   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3159   Ccusp->mat        = Cmat;
3160   Ccusp->mat->mat   = Ccsr;
3161   Ccsr->num_rows    = Ccusp->nrows;
3162   Ccsr->num_cols    = n;
3163   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3164   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3165   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3166   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3167   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3168   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3169   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3170   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3171   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3173   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3174     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3175     c->nz                = 0;
3176     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3177     Ccsr->values         = new THRUSTARRAY(c->nz);
3178     goto finalizesym;
3179   }
3180 
3181   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3182   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3183   Acsr = (CsrMatrix *)Amat->mat;
3184   if (!biscompressed) {
3185     Bcsr = (CsrMatrix *)Bmat->mat;
3186 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3187     BmatSpDescr = Bmat->matDescr;
3188 #endif
3189   } else { /* we need to use row offsets for the full matrix */
3190     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3191     Bcsr                 = new CsrMatrix;
3192     Bcsr->num_rows       = B->rmap->n;
3193     Bcsr->num_cols       = cBcsr->num_cols;
3194     Bcsr->num_entries    = cBcsr->num_entries;
3195     Bcsr->column_indices = cBcsr->column_indices;
3196     Bcsr->values         = cBcsr->values;
3197     if (!Bcusp->rowoffsets_gpu) {
3198       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3199       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3200       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3201     }
3202     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3203     mmdata->Bcsr      = Bcsr;
3204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3205     if (Bcsr->num_rows && Bcsr->num_cols) {
3206       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3207       PetscCallCUSPARSE(stat);
3208     }
3209     BmatSpDescr = mmdata->matSpBDescr;
3210 #endif
3211   }
3212   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3213   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3214   /* precompute flops count */
3215   if (ptype == MATPRODUCT_AB) {
3216     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3217       const PetscInt st = a->i[i];
3218       const PetscInt en = a->i[i + 1];
3219       for (j = st; j < en; j++) {
3220         const PetscInt brow = a->j[j];
3221         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3222       }
3223     }
3224   } else if (ptype == MATPRODUCT_AtB) {
3225     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3226       const PetscInt anzi = a->i[i + 1] - a->i[i];
3227       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3228       flops += (2. * anzi) * bnzi;
3229     }
3230   } else { /* TODO */
3231     flops = 0.;
3232   }
3233 
3234   mmdata->flops = flops;
3235   PetscCall(PetscLogGpuTimeBegin());
3236 
3237 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3238   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3239   // cuda-12.2 requires non-null csrRowOffsets
3240   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3241   PetscCallCUSPARSE(stat);
3242   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3243   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3244   {
3245     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3246      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3247   */
3248     void *dBuffer1 = NULL;
3249     void *dBuffer2 = NULL;
3250     void *dBuffer3 = NULL;
3251     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3252     size_t bufferSize1 = 0;
3253     size_t bufferSize2 = 0;
3254     size_t bufferSize3 = 0;
3255     size_t bufferSize4 = 0;
3256     size_t bufferSize5 = 0;
3257 
3258     /* ask bufferSize1 bytes for external memory */
3259     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3260     PetscCallCUSPARSE(stat);
3261     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3262     /* inspect the matrices A and B to understand the memory requirement for the next step */
3263     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3264     PetscCallCUSPARSE(stat);
3265 
3266     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3267     PetscCallCUSPARSE(stat);
3268     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3269     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3270     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3271     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3272     PetscCallCUSPARSE(stat);
3273     PetscCallCUDA(cudaFree(dBuffer1));
3274     PetscCallCUDA(cudaFree(dBuffer2));
3275 
3276     /* get matrix C non-zero entries C_nnz1 */
3277     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3278     c->nz = (PetscInt)C_nnz1;
3279     /* allocate matrix C */
3280     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3281     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3282     Ccsr->values = new THRUSTARRAY(c->nz);
3283     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3284     /* update matC with the new pointers */
3285     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3286     PetscCallCUSPARSE(stat);
3287 
3288     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3289     PetscCallCUSPARSE(stat);
3290     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3291     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3292     PetscCallCUSPARSE(stat);
3293     PetscCallCUDA(cudaFree(dBuffer3));
3294     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3295     PetscCallCUSPARSE(stat);
3296     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3297   }
3298   #else
3299   size_t bufSize2;
3300   /* ask bufferSize bytes for external memory */
3301   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3302   PetscCallCUSPARSE(stat);
3303   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3304   /* inspect the matrices A and B to understand the memory requirement for the next step */
3305   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3306   PetscCallCUSPARSE(stat);
3307   /* ask bufferSize again bytes for external memory */
3308   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3309   PetscCallCUSPARSE(stat);
3310   /* The CUSPARSE documentation is not clear, nor the API
3311      We need both buffers to perform the operations properly!
3312      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3313      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3314      is stored in the descriptor! What a messy API... */
3315   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3316   /* compute the intermediate product of A * B */
3317   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3318   PetscCallCUSPARSE(stat);
3319   /* get matrix C non-zero entries C_nnz1 */
3320   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3321   c->nz = (PetscInt)C_nnz1;
3322   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3323                       mmdata->mmBufferSize / 1024));
3324   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3325   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3326   Ccsr->values = new THRUSTARRAY(c->nz);
3327   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3328   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3329   PetscCallCUSPARSE(stat);
3330   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3331   PetscCallCUSPARSE(stat);
3332   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3333 #else
3334   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3335   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3336                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3337   PetscCallCUSPARSE(stat);
3338   c->nz                = cnz;
3339   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3340   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3341   Ccsr->values = new THRUSTARRAY(c->nz);
3342   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3343 
3344   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3345   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3346      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3347      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3348   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3349                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3350   PetscCallCUSPARSE(stat);
3351 #endif
3352   PetscCall(PetscLogGpuFlops(mmdata->flops));
3353   PetscCall(PetscLogGpuTimeEnd());
3354 finalizesym:
3355   c->free_a = PETSC_TRUE;
3356   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3357   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3358   c->free_ij = PETSC_TRUE;
3359   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3360     PetscInt      *d_i = c->i;
3361     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3362     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3363     ii = *Ccsr->row_offsets;
3364     jj = *Ccsr->column_indices;
3365     if (ciscompressed) d_i = c->compressedrow.i;
3366     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3367     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3368   } else {
3369     PetscInt *d_i = c->i;
3370     if (ciscompressed) d_i = c->compressedrow.i;
3371     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3372     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3373   }
3374   if (ciscompressed) { /* need to expand host row offsets */
3375     PetscInt r = 0;
3376     c->i[0]    = 0;
3377     for (k = 0; k < c->compressedrow.nrows; k++) {
3378       const PetscInt next = c->compressedrow.rindex[k];
3379       const PetscInt old  = c->compressedrow.i[k];
3380       for (; r < next; r++) c->i[r + 1] = old;
3381     }
3382     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3383   }
3384   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3385   PetscCall(PetscMalloc1(m, &c->ilen));
3386   PetscCall(PetscMalloc1(m, &c->imax));
3387   c->maxnz         = c->nz;
3388   c->nonzerorowcnt = 0;
3389   c->rmax          = 0;
3390   for (k = 0; k < m; k++) {
3391     const PetscInt nn = c->i[k + 1] - c->i[k];
3392     c->ilen[k] = c->imax[k] = nn;
3393     c->nonzerorowcnt += (PetscInt)!!nn;
3394     c->rmax = PetscMax(c->rmax, nn);
3395   }
3396   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3397   PetscCall(PetscMalloc1(c->nz, &c->a));
3398   Ccsr->num_entries = c->nz;
3399 
3400   C->nonzerostate++;
3401   PetscCall(PetscLayoutSetUp(C->rmap));
3402   PetscCall(PetscLayoutSetUp(C->cmap));
3403   Ccusp->nonzerostate = C->nonzerostate;
3404   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3405   C->preallocated     = PETSC_TRUE;
3406   C->assembled        = PETSC_FALSE;
3407   C->was_assembled    = PETSC_FALSE;
3408   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3409     mmdata->reusesym = PETSC_TRUE;
3410     C->offloadmask   = PETSC_OFFLOAD_GPU;
3411   }
3412   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3413   PetscFunctionReturn(PETSC_SUCCESS);
3414 }
3415 
3416 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3417 
3418 /* handles sparse or dense B */
/*
  Chooses the product-symbolic implementation for products whose A factor is MATSEQAIJCUSPARSE.
  B (and C, for ABC products) may be sparse (MATSEQAIJCUSPARSE) or dense (MATSEQDENSE);
  command line options allow the user to force the CPU backend for sparse-sparse products.
  Falls back to the plain SeqAIJ dispatch when the GPU path does not apply.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  /* is B dense (MATSEQDENSE or a type derived from it)? */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU sparse-sparse path when neither factor has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on which API the user came through: the classic entry points
       (MatMatMult etc., api_user true) each get their own option; the MatProduct API shares
       -mat_product_algorithm_backend_cpu across product types */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU backend: pretend the other factors are not CUSPARSE */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        /* sparse (GPU) times dense: use the CUDA dense kernels */
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      /* two-matrix sparse products go to the GPU SpGEMM-based implementation */
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composed products are handled as a sequence of two-matrix products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3535 
/* yy = A * xx on the GPU; thin wrapper over the shared mult kernel (trans = herm = PETSC_FALSE, no add) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3542 
/* zz = A * xx + yy on the GPU; thin wrapper over the shared mult kernel (trans = herm = PETSC_FALSE) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3549 
/* yy = A^H * xx on the GPU; thin wrapper over the shared mult kernel (trans = herm = PETSC_TRUE, no add) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3556 
/* zz = A^H * xx + yy on the GPU; thin wrapper over the shared mult kernel (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3563 
/* yy = A^T * xx on the GPU; thin wrapper over the shared mult kernel (trans = PETSC_TRUE, herm = PETSC_FALSE, no add) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3570 
/* y[idx[i]] += x[i] for i = 0..n-1: scatter-add a compressed work vector x into the full-length
   vector y. Assumes the entries of idx are unique (true for the compressed-row indices the callers
   pass — TODO confirm for any new caller), otherwise the unguarded read-modify-write would race.
   Expects a 1-D launch covering at least n threads; extra threads exit via the bounds check. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* use PetscInt for the global index: a plain int could overflow when n exceeds 2^31
     with 64-bit PetscInt builds, since blockIdx.x * blockDim.x is evaluated in 32 bits */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3576 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared kernel behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add).
   yy may be NULL (no add) and may alias zz; herm without trans is rejected. Handles matrices
   stored with compressed (zero-row-dropped) CSR via a work vector plus a scatter step. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x; only set (and only used) for the CSR format */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A) x == 0, so z is just y (or zero when there is no y) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use an explicitly assembled A^T, built on demand and cached on the matrix */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* workVector[i] = x[cprowIndices[i]]; done with thrust (asynchronously on PETSc's stream when available) */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      /* opA indexes fixed-size per-operation arrays below, so guard against ABI drift in the enum values */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) { /* lazily create the per-op CSR descriptor from the cached device arrays */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy csrmv interface */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) { /* NOTE(review): thrust calls above may throw; this mirrors the catch style used elsewhere in this file */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop counts: 2 flops (multiply + add) per stored nonzero; without an add vector, presumably
     one add per nonzero row is saved (beta = 0 path) — hence the nonzerorowcnt correction */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3750 
/* zz = A^T * xx + yy on the GPU; thin wrapper over the shared mult kernel (trans = PETSC_TRUE, herm = PETSC_FALSE) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3757 
/* Assembly end: the host AIJ assembly does all the work here; the GPU copy is refreshed
   lazily (e.g. by MatSeqAIJCUSPARSECopyToGPU inside the mult kernels) rather than eagerly. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3764 
3765 /*@
3766   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3767   (the default parallel PETSc format).
3768 
3769   Collective
3770 
3771   Input Parameters:
3772 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3773 . m    - number of rows
3774 . n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3776 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3777 
3778   Output Parameter:
3779 . A - the matrix
3780 
3781   Level: intermediate
3782 
3783   Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3785   calculations. For good matrix assembly performance the user should preallocate the matrix
3786   storage by setting the parameter `nz` (or the array `nnz`).
3787 
3788   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
3790   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3791 
3792   The AIJ format, also called
3793   compressed row storage, is fully compatible with standard Fortran
3794   storage.  That is, the stored row and column indices can begin at
3795   either one (as in Fortran) or zero.
3796 
3797   Specify the preallocated storage with either nz or nnz (not both).
3798   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3799   allocation.
3800 
3801 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3802 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* const is cast away only because the _SeqAIJ implementation takes a non-const array;
     NOTE(review): assumed to leave nnz unmodified, matching the public MatSeqAIJSetPreallocation contract */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3812 
/* Destroys the GPU-side data (the SpMV structures for unfactored matrices, or the triangular-factor
   structures for factored ones), removes the methods composed on the object, then delegates to the
   host SeqAIJ destroy for the CPU-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* un-compose every function installed for this type so the object can be safely retyped/destroyed */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3834 
3835 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3836 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by first duplicating the CPU (SeqAIJ) representation, then converting
   the copy in place back to the CUSPARSE type, which (re)creates the GPU-side context */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3844 
/* Y = Y + a*X computed on the GPU where possible.
   - If X and Y do not share this implementation (e.g. one is bound to the CPU),
     fall back to the SeqAIJ CPU kernel.
   - SAME_NONZERO_PATTERN: a single cublas axpy on the two CSR value arrays.
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam with b = 1.0, result written in place
     into Y's CSR arrays.
   - Anything else falls back to the CPU kernel, which may change Y's pattern. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* implementations differ: compute on the CPU and mark Y's cached transpose stale */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nz count: compare the index arrays on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient multiplying Y in the geam: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* a and b live on the host for this call sequence */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the pointer mode expected by the rest of the code */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    /* identical patterns: ay[] += a*ax[] elementwise over the shared nonzero layout */
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3930 
/* Scale all stored nonzeros of Y by a with a single cublas scal over the CSR value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values)); /* device pointer to the nonzero values */
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* nz may exceed what a PetscBLASInt holds; cast checks */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal no longer matches the scaled values */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3950 
/* Zero all stored values: on an unfactored matrix also zero the GPU copies (matrix
   and cached transpose) so both host and device remain valid afterwards */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ *)A->data;
  PetscBool   ongpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE; /* GPU copy zeroed too, so both copies stay in sync */
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n])); /* zero the host CSR values */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3977 
/* Report that this matrix type keeps its current data in CUDA device memory */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3984 
/* Switch the matrix operation table between the CPU (SeqAIJ) and GPU (CUSPARSE) kernels.
   flg == PETSC_TRUE binds to the CPU: data is first synced back from the GPU and the
   GPU-specific composed methods are removed; flg == PETSC_FALSE (re)installs the GPU
   kernels and composed methods. Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host copy is current before handing control to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* drop the CUSPARSE array-access hooks so the plain SeqAIJ paths are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    /* route raw value-array access through the CUSPARSE versions so offload state stays consistent */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the CPU kernels */
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4050 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE (in place or into a new matrix): sets the
   default vector type to CUDA, allocates the CUSPARSE context (or triangular-factor
   context for factored matrices) in B->spptr when not already present, installs the
   GPU operation table via MatBindToCPU, and composes the CUSPARSE-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, *newmat is A itself */

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      /* all cusparse work is queued on PETSc's default CUDA stream */
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* installs the rest of the GPU operation table and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4111 
/* Type constructor: build a plain SeqAIJ matrix and convert it in place to SeqAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4119 
4120 /*MC
4121    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4122 
4123    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4124    CSR, ELL, or Hybrid format.
4125    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4126 
4127    Options Database Keys:
4128 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4129 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4130                                       Other options include ell (ellpack) or hyb (hybrid).
4131 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4132 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4133 
4134   Level: beginner
4135 
4136 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4137 M*/
4138 
/* Register the cusparse solver package for every factorization kind it supports,
   all handled by the same factory routine */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(factors) / sizeof(factors[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4148 
/* Free the CUSPARSE context attached to an unfactored matrix: the matrix and
   transpose multiply structures, scratch vectors/arrays, and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *spptr = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!spptr) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->mat, spptr->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose, spptr->format));
  delete spptr->workVector;
  delete spptr->rowoffsets_gpu;
  delete spptr->csr2csc_i;
  delete spptr->coords;
  if (spptr->handle) PetscCallCUSPARSE(cusparseDestroy(spptr->handle));
  PetscCall(PetscFree(mat->spptr)); /* also sets mat->spptr to NULL */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4166 
/* Delete a CsrMatrix and its three thrust device arrays, nulling the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4179 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor structure (legacy csrsv path, CUDA < 11.4): cusparse
   descriptors, the factor's CSR storage, and the device/host scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *fs = *trifactor;

  PetscFunctionBegin;
  if (fs) {
    if (fs->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->descr));
    if (fs->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(fs->solveInfo));
    PetscCall(CsrMatrix_Destroy(&fs->csrMat));
    if (fs->solveBuffer) PetscCallCUDA(cudaFree(fs->solveBuffer));
    if (fs->AA_h) PetscCallCUDA(cudaFreeHost(fs->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (fs->csr2cscBuffer) PetscCallCUDA(cudaFree(fs->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor)); /* frees and nulls the caller's pointer */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4198 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the storage-format-specific matrix data
   (CSR, or ELL/HYB on pre-11.0 CUDA), the cusparse matrix descriptor, the device-side
   scalar constants, and (CUDA >= 11) the cached SpMV/SpMM descriptors and buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident copies of the scalars 1 and 0 used with device pointer mode */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached descriptor set per operation variant (non-transpose/transpose/hermitian) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4245 
/* Release the data held by a triangular-factor context while keeping the context
   (and its cusparse handle) alive, so it can be reused by a new factorization. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy csrsv path keeps four separate factor structures */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* NOTE(review): the device pointers freed below are not reset to NULL here;
       presumably a subsequent factorization reallocates them before reuse -- confirm
       Reset cannot be invoked twice in a row on the same context. */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4299 
/* Fully destroy a triangular-factor context: reset its contents, then release the
   cusparse handle and the context itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy(fs->handle));
    PetscCall(PetscFree(*trifactors)); /* frees and nulls the caller's pointer */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4310 
/* Strict-weak ordering on (row, col) tuples: sort by row first, then by column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4319 
/* Mark the cached transpose (used by the transpose multiply kernels) as stale; when
   destroy is true also free the transpose structure and the csr2csc index map */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4335 
/* Container destructor for the device-side COO struct: free the two device arrays
   (perm and jmap) and the host struct that holds them */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo_d = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo_d->perm));
  PetscCallCUDA(cudaFree(coo_d->jmap));
  PetscCall(PetscFree(coo_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4346 
/* Preallocate from COO indices. The indices may live in device memory, in which case
   they are staged to the host because the symbolic work is done by the CPU SeqAIJ
   routine. Afterwards the host-side COO struct (jmap/perm) is mirrored to the GPU
   and attached to the matrix for later use by MatSetValuesCOO. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* indices are on the device: copy them to temporary host buffers */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* the symbolic phase only touched the host copy */
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4387 
/* Grid-stride kernel that accumulates COO input values into the CSR value array.
   kv[]   - user-provided values, already on the device
   nnz    - number of CSR nonzeros (a[] has nnz entries)
   jmap[] - jmap[i]..jmap[i+1] delimits, in perm[], the COO entries mapping to a[i]
   perm[] - permutation from COO input order to sorted order
   imode  - INSERT_VALUES overwrites a[i], otherwise the sum is added to a[i] */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  /* cast before multiplying: blockIdx.x * blockDim.x is evaluated in unsigned int and
     could overflow for very large grids before being widened to PetscCount */
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4398 
/* Insert/add values given in COO order: v[] (host or device) is gathered through the
   jmap/perm maps built at preallocation time and accumulated into the CSR values. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites everything, so write-only access avoids syncing old values to the GPU */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* divide in PetscCount before narrowing: the previous (int)(Annz + 255) / 256
       cast the sum to int first and could overflow for Annz near INT_MAX */
    const int nblocks = (int)((Annz + 255) / 256);

    MatAddCOOValues<<<nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4438 
4439 /*@C
4440   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4441 
4442   Not Collective
4443 
4444   Input Parameters:
4445 + A          - the matrix
4446 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4447 
4448   Output Parameters:
4449 + i - the CSR row pointers
4450 - j - the CSR column indices
4451 
4452   Level: developer
4453 
4454   Note:
4455   When compressed is true, the CSR structure does not contain empty rows
4456 
4457 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4458 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJ         *aij            = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both pointers must be requested, otherwise there is nothing to do */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* device structure must be current before handing out device pointers */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  if (!compressed && aij->compressedrow.use) {
    /* device CSR is row-compressed but the caller wants full row offsets: build and cache them lazily on the device */
    if (!cusparsestruct->rowoffsets_gpu) {
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(aij->i, aij->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusparsestruct->rowoffsets_gpu->data().get();
  } else *i = csrmat->row_offsets->data().get();
  *j = csrmat->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4486 
4487 /*@C
4488   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4489 
4490   Not Collective
4491 
4492   Input Parameters:
4493 + A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4495 . i          - the CSR row pointers
4496 - j          - the CSR column indices
4497 
4498   Level: developer
4499 
4500 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4501 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused; kept for symmetry with MatSeqAIJCUSPARSEGetIJ() */
  /* invalidate the caller's pointers; nothing else needs to be undone */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4512 
4513 /*@C
4514   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4515 
4516   Not Collective
4517 
4518   Input Parameter:
4519 . A - a `MATSEQAIJCUSPARSE` matrix
4520 
4521   Output Parameter:
4522 . a - pointer to the device data
4523 
4524   Level: developer
4525 
4526   Note:
4527   May trigger host-device copies if up-to-date matrix data is on host
4528 
4529 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4530 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy if the host holds newer data */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offload mask and cached transpose are left untouched */
  *a = csrmat->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4548 
4549 /*@C
4550   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4551 
4552   Not Collective
4553 
4554   Input Parameters:
4555 + A - a `MATSEQAIJCUSPARSE` matrix
4556 - a - pointer to the device data
4557 
4558   Level: developer
4559 
4560 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4561 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state change is required, just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4571 
4572 /*@C
4573   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4574 
4575   Not Collective
4576 
4577   Input Parameter:
4578 . A - a `MATSEQAIJCUSPARSE` matrix
4579 
4580   Output Parameter:
4581 . a - pointer to the device data
4582 
4583   Level: developer
4584 
4585   Note:
4586   May trigger host-device copies if up-to-date matrix data is on host
4587 
4588 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4589 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy if the host holds newer data */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller may modify values through this pointer: device copy becomes authoritative, cached transpose values go stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4609 /*@C
4610   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4611 
4612   Not Collective
4613 
4614   Input Parameters:
4615 + A - a `MATSEQAIJCUSPARSE` matrix
4616 - a - pointer to the device data
4617 
4618   Level: developer
4619 
4620 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4621 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the caller must not use the pointer past this point */
  /* values may have been modified: drop cached diagonal information and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4633 
4634 /*@C
4635   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4636 
4637   Not Collective
4638 
4639   Input Parameter:
4640 . A - a `MATSEQAIJCUSPARSE` matrix
4641 
4642   Output Parameter:
4643 . a - pointer to the device data
4644 
4645   Level: developer
4646 
4647   Note:
4648   Does not trigger host-device copies and flags data validity on the GPU
4649 
4650 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4651 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the device copy becomes authoritative and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4670 
4671 /*@C
4672   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4673 
4674   Not Collective
4675 
4676   Input Parameters:
4677 + A - a `MATSEQAIJCUSPARSE` matrix
4678 - a - pointer to the device data
4679 
4680   Level: developer
4681 
4682 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4683 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the caller must not use the pointer past this point */
  /* values were (re)written: drop cached diagonal information and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4695 
/* strict-weak ordering on (row, col, value, flag) tuples: lexicographic by (row, col) only */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4704 
/* unary functor: translate an integer (index) by a fixed offset */
struct Shift {
  int delta; /* constant added to every input value */

  Shift(int shift) : delta(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + delta; }
};
4711 
/* Merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows; [A';B']' operation in MATLAB notation.

   With MAT_INITIAL_MATRIX both the sparsity pattern and the values of C are built on the GPU and then mirrored
   to the host; the interleaving of A's and B's entries inside C is remembered in Ccusp->coords so that a later
   MAT_REUSE_MATRIX call only has to scatter the (new) values of A and B into C. MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* row-wise concatenation requires A and B to have the same number of rows */
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* C is m x (cols(A) + cols(B)); create it and build its CUSPARSE structures from scratch */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C is stored as a full CSR matrix, no compressed-row storage */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    /* cuSPARSE descriptor and device-resident scalar constants for C */
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    /* make sure the device copies of A and B are current before reading their CSR data */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz); /* maps [A;B] entry order to positions inside C's value array */
    if (c->nz) {
      /* GPU merge:
         1) expand A's and B's row offsets to COO row indices,
         2) merge the (row, col, value, source-flag) tuple streams by (row, col),
         3) recover from the merged flags where A's and B's entries landed in C,
         4) compress the merged COO row indices back to CSR row offsets */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1); /* entries originating from A are flagged 1, from B flagged 0 */
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* view B's column indices shifted by cols(A): B occupies the right block of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      /* merge the tuple streams by (row, col); wPerm receives the interleaved A/B source flags */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
  #else
      auto pred = cuda::std::identity();
  #endif
      /* split positions by source flag: [p1, p1+Annz) gets the positions of A's entries in C, [p2, ...) those of B's */
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* merged COO row indices -> CSR row offsets of C */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' B']: C's transpose CSR is A's transpose CSR followed by B's transpose CSR with row offsets shifted by nnz(A) */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* step back so A's trailing offset is overwritten by B's (shifted) first offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's CSR structure to the host (Mat_SeqAIJ side); the host value array is allocated but not filled */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    /* per-row bookkeeping (ilen/imax/nonzerorowcnt/rmax) expected by MATSEQAIJ */
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: C's pattern is unchanged, only scatter the current values of A and B into it */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* coords[0..Annz) are positions in C of A's entries, coords[Annz..) those of B's entries (see MAT_INITIAL_MATRIX branch) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* transpose values are simply A' values followed by B' values */
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* the authoritative copy of C's values lives on the device */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4996 
/* Copy selected entries of the device value array of A into v[]: v[k] = a[idx[k]] for k in [0, n),
   or the first n entries when idx is NULL. v may be either host or device memory; a host v triggers
   a device-to-host transfer at the end.

   Fix: the final value transfer when v is host memory goes device->host (cudaMemcpyDeviceToHost),
   so it must be logged with PetscLogGpuToCpu(); the original logged it with PetscLogCpuToGpu(). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does v live on the device? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index list, then gather av[idx[k]] -> dv[k] with a permutation iterator on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w; /* deleting NULL is a no-op */
  } else {
    /* no index list: plain contiguous copy of the leading n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* values were moved device -> host above */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5032