xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 1d27aa22b2f6148b2c4e3f06a75e0638d6493e09)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
/* Name table handed to PetscOptionsEnum() for MatCUSPARSEStorageFormat: the selectable value names come first,
   followed by the enum type name, the common enum prefix and a null terminator */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", nullptr};
26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29 
30   typedef enum {
31       CUSPARSE_MV_ALG_DEFAULT = 0,
32       CUSPARSE_COOMV_ALG      = 1,
33       CUSPARSE_CSRMV_ALG1     = 2,
34       CUSPARSE_CSRMV_ALG2     = 3
35   } cusparseSpMVAlg_t;
36 
37   typedef enum {
38       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44       CUSPARSE_SPMM_COO_ALG1    = 1,
45       CUSPARSE_SPMM_COO_ALG2    = 2,
46       CUSPARSE_SPMM_COO_ALG3    = 3,
47       CUSPARSE_SPMM_COO_ALG4    = 5,
48       CUSPARSE_SPMM_CSR_ALG1    = 4,
49       CUSPARSE_SPMM_CSR_ALG2    = 6,
50   } cusparseSpMMAlg_t;
51 
52   typedef enum {
53       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55   } cusparseCsr2CscAlg_t;
56   */
/* Name tables handed to PetscOptionsEnum(); the position of each value name must equal the integer value of the matching cuSPARSE enumerator */
const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", nullptr};
const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", nullptr};
/* cusparseCsr2CscAlg_t has no enumerator with value 0, so an "INVALID" placeholder occupies that slot */
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID", "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", nullptr};
60 #endif
61 
/* Forward declarations: factorization routines */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
/* Forward declarations that are only needed when building against CUDA older than 11.4 */
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
/* Forward declarations: options handling and matrix/vector operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Forward declarations: destruction of the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

/* Forward declarations: host/device synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* Forward declarations: sub-array copy and COO assembly */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95 
96 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
97 {
98   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
99 
100   PetscFunctionBegin;
101   switch (op) {
102   case MAT_CUSPARSE_MULT:
103     cusparsestruct->format = format;
104     break;
105   case MAT_CUSPARSE_ALL:
106     cusparsestruct->format = format;
107     break;
108   default:
109     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
110   }
111   PetscFunctionReturn(PETSC_SUCCESS);
112 }
113 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* forward to the implementation registered on A under the name "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
137 
138 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
139 {
140   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
141 
142   PetscFunctionBegin;
143   cusparsestruct->use_cpu_solve = use_cpu;
144   PetscFunctionReturn(PETSC_SUCCESS);
145 }
146 
/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* forward to the implementation registered on A under the name "MatCUSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
170 
171 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
172 {
173   PetscFunctionBegin;
174   switch (op) {
175   case MAT_FORM_EXPLICIT_TRANSPOSE:
176     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
177     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
178     A->form_explicit_transpose = flg;
179     break;
180   default:
181     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
182     break;
183   }
184   PetscFunctionReturn(PETSC_SUCCESS);
185 }
186 
187 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
188 {
189   MatCUSPARSEStorageFormat format;
190   PetscBool                flg;
191   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
192 
193   PetscFunctionBegin;
194   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
195   if (A->factortype == MAT_FACTOR_NONE) {
196     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
197     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
198 
199     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
201     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
202     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
204     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
205     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
206   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
207     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
208   #else
209     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
210   #endif
211     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
212     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
213 
214     PetscCall(
215       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
216     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
217 #endif
218   }
219   PetscOptionsHeadEnd();
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
224 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
225 {
226   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
227   PetscInt                      m  = A->rmap->n;
228   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
229   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
230   const MatScalar              *Aa = a->a;
231   PetscInt                     *Mi, *Mj, Mnz;
232   PetscScalar                  *Ma;
233 
234   PetscFunctionBegin;
235   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
236     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
237       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
238       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
239       PetscCall(PetscMalloc1(m + 1, &Mi));
240       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
241       PetscCall(PetscMalloc1(Mnz, &Ma));
242       Mi[0] = 0;
243       for (PetscInt i = 0; i < m; i++) {
244         PetscInt llen = Ai[i + 1] - Ai[i];
245         PetscInt ulen = Adiag[i] - Adiag[i + 1];
246         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
247         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
248         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
249         Mi[i + 1] = Mi[i] + llen + ulen;
250       }
251       // Copy M (L,U) from host to device
252       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
253       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
254       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
255       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
256       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));
257 
258       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
259       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
260       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
261       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
262       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
263       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
264       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
265       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
266 
267       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
268       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
269       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
270 
271       fillMode = CUSPARSE_FILL_MODE_UPPER;
272       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
273       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
274       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
275       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
276 
277       // Allocate work vectors in SpSv
278       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
279       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));
280 
281       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
282       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
283 
284       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
285       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
286       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
287       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
288       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
289       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
290       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
291 
292       // Record for reuse
293       fs->csrRowPtr_h = Mi;
294       fs->csrVal_h    = Ma;
295       PetscCall(PetscFree(Mj));
296     }
297     // Copy the value
298     Mi  = fs->csrRowPtr_h;
299     Ma  = fs->csrVal_h;
300     Mnz = Mi[m];
301     for (PetscInt i = 0; i < m; i++) {
302       PetscInt llen = Ai[i + 1] - Ai[i];
303       PetscInt ulen = Adiag[i] - Adiag[i + 1];
304       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
305       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
306       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
307     }
308     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
309 
310     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
311     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
312 
313     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
314 
315     // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
316     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
317   }
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 #else
321 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
322 {
323   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
324   PetscInt                           n                  = A->rmap->n;
325   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
326   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
327   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
328   const MatScalar                   *aa = a->a, *v;
329   PetscInt                          *AiLo, *AjLo;
330   PetscInt                           i, nz, nzLower, offset, rowOffset;
331 
332   PetscFunctionBegin;
333   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
334   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
335     try {
336       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
337       nzLower = n + ai[n] - ai[1];
338       if (!loTriFactor) {
339         PetscScalar *AALo;
340 
341         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
342 
343         /* Allocate Space for the lower triangular matrix */
344         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
345         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
346 
347         /* Fill the lower triangular matrix */
348         AiLo[0]   = (PetscInt)0;
349         AiLo[n]   = nzLower;
350         AjLo[0]   = (PetscInt)0;
351         AALo[0]   = (MatScalar)1.0;
352         v         = aa;
353         vi        = aj;
354         offset    = 1;
355         rowOffset = 1;
356         for (i = 1; i < n; i++) {
357           nz = ai[i + 1] - ai[i];
358           /* additional 1 for the term on the diagonal */
359           AiLo[i] = rowOffset;
360           rowOffset += nz + 1;
361 
362           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
363           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
364 
365           offset += nz;
366           AjLo[offset] = (PetscInt)i;
367           AALo[offset] = (MatScalar)1.0;
368           offset += 1;
369 
370           v += nz;
371           vi += nz;
372         }
373 
374         /* allocate space for the triangular factor information */
375         PetscCall(PetscNew(&loTriFactor));
376         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
377         /* Create the matrix description */
378         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
379         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
380   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
381         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
382   #else
383         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
384   #endif
385         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
386         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
387 
388         /* set the operation */
389         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
390 
391         /* set the matrix */
392         loTriFactor->csrMat              = new CsrMatrix;
393         loTriFactor->csrMat->num_rows    = n;
394         loTriFactor->csrMat->num_cols    = n;
395         loTriFactor->csrMat->num_entries = nzLower;
396 
397         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
398         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
399 
400         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
401         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
402 
403         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
404         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
405 
406         /* Create the solve analysis information */
407         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
408         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
409   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
410         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
411                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
412         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
413   #endif
414 
415         /* perform the solve analysis */
416         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
417                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
418         PetscCallCUDA(WaitForCUDA());
419         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
420 
421         /* assign the pointer */
422         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
423         loTriFactor->AA_h                                          = AALo;
424         PetscCallCUDA(cudaFreeHost(AiLo));
425         PetscCallCUDA(cudaFreeHost(AjLo));
426         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
427       } else { /* update values only */
428         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
429         /* Fill the lower triangular matrix */
430         loTriFactor->AA_h[0] = 1.0;
431         v                    = aa;
432         vi                   = aj;
433         offset               = 1;
434         for (i = 1; i < n; i++) {
435           nz = ai[i + 1] - ai[i];
436           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
437           offset += nz;
438           loTriFactor->AA_h[offset] = 1.0;
439           offset += 1;
440           v += nz;
441         }
442         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
443         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
444       }
445     } catch (char *ex) {
446       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
447     }
448   }
449   PetscFunctionReturn(PETSC_SUCCESS);
450 }
451 
452 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
453 {
454   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
455   PetscInt                           n                  = A->rmap->n;
456   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
457   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
458   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
459   const MatScalar                   *aa = a->a, *v;
460   PetscInt                          *AiUp, *AjUp;
461   PetscInt                           i, nz, nzUpper, offset;
462 
463   PetscFunctionBegin;
464   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
465   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
466     try {
467       /* next, figure out the number of nonzeros in the upper triangular matrix. */
468       nzUpper = adiag[0] - adiag[n];
469       if (!upTriFactor) {
470         PetscScalar *AAUp;
471 
472         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
473 
474         /* Allocate Space for the upper triangular matrix */
475         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
476         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
477 
478         /* Fill the upper triangular matrix */
479         AiUp[0] = (PetscInt)0;
480         AiUp[n] = nzUpper;
481         offset  = nzUpper;
482         for (i = n - 1; i >= 0; i--) {
483           v  = aa + adiag[i + 1] + 1;
484           vi = aj + adiag[i + 1] + 1;
485 
486           /* number of elements NOT on the diagonal */
487           nz = adiag[i] - adiag[i + 1] - 1;
488 
489           /* decrement the offset */
490           offset -= (nz + 1);
491 
492           /* first, set the diagonal elements */
493           AjUp[offset] = (PetscInt)i;
494           AAUp[offset] = (MatScalar)1. / v[nz];
495           AiUp[i]      = AiUp[i + 1] - (nz + 1);
496 
497           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
498           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
499         }
500 
501         /* allocate space for the triangular factor information */
502         PetscCall(PetscNew(&upTriFactor));
503         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
504 
505         /* Create the matrix description */
506         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
507         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
508   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
509         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
510   #else
511         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
512   #endif
513         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
514         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
515 
516         /* set the operation */
517         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
518 
519         /* set the matrix */
520         upTriFactor->csrMat              = new CsrMatrix;
521         upTriFactor->csrMat->num_rows    = n;
522         upTriFactor->csrMat->num_cols    = n;
523         upTriFactor->csrMat->num_entries = nzUpper;
524 
525         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
526         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
527 
528         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
529         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
530 
531         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
532         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
533 
534         /* Create the solve analysis information */
535         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
537   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
538         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
539                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
540         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
541   #endif
542 
543         /* perform the solve analysis */
544         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
545                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
546 
547         PetscCallCUDA(WaitForCUDA());
548         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
549 
550         /* assign the pointer */
551         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
552         upTriFactor->AA_h                                          = AAUp;
553         PetscCallCUDA(cudaFreeHost(AiUp));
554         PetscCallCUDA(cudaFreeHost(AjUp));
555         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
556       } else {
557         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
558         /* Fill the upper triangular matrix */
559         offset = nzUpper;
560         for (i = n - 1; i >= 0; i--) {
561           v = aa + adiag[i + 1] + 1;
562 
563           /* number of elements NOT on the diagonal */
564           nz = adiag[i] - adiag[i + 1] - 1;
565 
566           /* decrement the offset */
567           offset -= (nz + 1);
568 
569           /* first, set the diagonal elements */
570           upTriFactor->AA_h[offset] = 1. / v[nz];
571           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
572         }
573         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
574         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
575       }
576     } catch (char *ex) {
577       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
578     }
579   }
580   PetscFunctionReturn(PETSC_SUCCESS);
581 }
582 #endif
583 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - push the host ILU/LU factors of A to the GPU.

  Builds (or refreshes) the device-side factors through the CUDA-version specific builder(s),
  records the number of nonzeros, marks A as valid on both host and device, and uploads the
  row/column permutation index arrays the first time a non-identity permutation is seen.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Errors with PETSC_ERR_COR when A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  IS                            rowperm = aij->row;
  IS                            colperm = aij->icol;
  const PetscInt                nrows   = A->rmap->n;
  PetscBool                     is_identity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  /* the work vector is created lazily, only once */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif

  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH; /* the factored matrix is now in sync on the GPU */

  /* row permutation: uploaded only when it is not the identity and not already on the device */
  PetscCall(ISIdentity(rowperm, &is_identity));
  if (!is_identity) {
    if (!factors->rpermIndices) {
      const PetscInt *idx;

      PetscCall(ISGetIndices(rowperm, &idx));
      factors->rpermIndices = new THRUSTINTARRAY(nrows);
      factors->rpermIndices->assign(idx, idx + nrows);
      PetscCall(ISRestoreIndices(rowperm, &idx));
      PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
    }
  }

  /* column permutation: same treatment */
  PetscCall(ISIdentity(colperm, &is_identity));
  if (!is_identity) {
    if (!factors->cpermIndices) {
      const PetscInt *idx;

      PetscCall(ISGetIndices(colperm, &idx));
      factors->cpermIndices = new THRUSTINTARRAY(nrows);
      factors->cpermIndices->assign(idx, idx + nrows);
      PetscCall(ISRestoreIndices(colperm, &idx));
      PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
630 
631 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - mirror the host Cholesky/ICC factor U on the GPU
  as a regular CSR matrix and run the cuSPARSE SpSV analysis for both the U and the U^T solve.

  Does nothing unless A->offloadmask is PETSC_OFFLOAD_CPU. The first call (detected through
  fs->csrRowPtr being NULL) allocates all device buffers, descriptors and SpSV work space and
  uploads the sparsity pattern; every call re-uploads the values and redoes the numeric analysis.

  Input Parameter:
. A - the factored matrix; A->spptr is a Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                nrows   = A->rmap->n;
  const PetscInt               *rowptr  = aij->i;
  const PetscInt               *colidx  = aij->j;
  const PetscInt               *diagpos = aij->diag;
  const MatScalar              *vals    = aij->a;
  PetscScalar                  *hvals, *hdiag;
  PetscInt                      nnz;

  PetscFunctionBegin;
  // Only act when the most recent factors live on the host
  if (A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(PETSC_SUCCESS);

  // One-time setup. csrRowPtr is the marker because it is non-null even when nrows = 0
  if (!fs->csrRowPtr) {
    PetscInt *hcols; // temporary host copy of the column indices

    // Re-arrange the (skewed) host factor into M, a regular CSR matrix on the host.
    // See the comments at MatICCFactorSymbolic_SeqAIJ() on the host layout of the factored matrix (U).
    nnz = rowptr[nrows]; // nonzeros of U, unit diagonal included
    PetscCall(PetscMalloc1(nnz, &hvals));
    PetscCall(PetscMalloc1(nnz, &hcols));
    PetscCall(PetscMalloc1(nrows, &hdiag)); // the diagonal
    for (PetscInt row = 0; row < nrows; row++) {
      const PetscInt start    = rowptr[row];
      const PetscInt noffdiag = rowptr[row + 1] - start - 1;

      hcols[start] = row;                                                    // diagonal entry goes first
      PetscCall(PetscArraycpy(hcols + start + 1, colidx + start, noffdiag)); // then the entries of U right of the diagonal
    }

    // Device storage for M (U) and the diagonal; row pointers and column indices are uploaded now
    PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (nrows + 1)));
    PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * nnz));
    PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * nnz));
    PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * nrows));
    PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, rowptr, sizeof(*rowptr) * (nrows + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->csrColIdx, hcols, sizeof(*hcols) * nnz, cudaMemcpyHostToDevice));

    // Sparse matrix descriptor for U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t :
    // with CUSPARSE_DIAG_TYPE_UNIT the routines assume every diagonal entry equals one and neither read
    // nor modify the stored diagonal, whatever its actual contents are.
    cusparseFillMode_t        fill      = CUSPARSE_FILL_MODE_UPPER;
    cusparseDiagType_t        diagKind  = CUSPARSE_DIAG_TYPE_UNIT; // U has a unit diagonal
    const cusparseIndexType_t indexKind = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

    PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, nrows, nrows, nnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexKind, indexKind, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
    PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fill, sizeof(fill)));
    PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagKind, sizeof(diagKind)));

    // Work vectors used by the SpSV solves, with their dense-vector descriptors
    PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * nrows));
    PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * nrows));
    PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, nrows, fs->X, cusparse_scalartype));
    PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, nrows, fs->Y, cusparse_scalartype));

    // SpSV descriptor + buffer for the U solve
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

    // SpSV descriptor + buffer for the U^T solve: same matrix descriptor, separate descr and buffer
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

    // Keep the host staging arrays for later value refreshes; the column copy is no longer needed
    fs->csrVal_h = hvals;
    fs->diag_h   = hdiag;
    PetscCall(PetscFree(hcols));
  }

  // Refresh the values through the host staging arrays
  hvals = fs->csrVal_h;
  hdiag = fs->diag_h;
  nnz   = rowptr[nrows];
  for (PetscInt row = 0; row < nrows; row++) {
    const PetscInt start = rowptr[row];
    const PetscInt end   = rowptr[row + 1];

    hdiag[row]   = vals[diagpos[row]]; // this host entry is actually the inverse of the diagonal
    hvals[start] = (MatScalar)1.0;     // unit diagonal; cosmetic, cusparse does not read it given CUSPARSE_DIAG_TYPE_UNIT
    // off-diagonals: shifted one slot right to make room for the diagonal, with the sign flipped
    for (PetscInt src = start; src < end - 1; src++) hvals[src + 1] = -vals[src];
  }
  PetscCallCUDA(cudaMemcpy(fs->csrVal, hvals, sizeof(*hvals) * nnz, cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(fs->diag, hdiag, sizeof(*hdiag) * nrows, cudaMemcpyHostToDevice));

  // cusparseSpSV_analysis() is numeric: it needs valid, up-to-date matrix values, hence it is redone on every call
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  PetscFunctionReturn(PETSC_SUCCESS);
}
716 
717 // Solve Ut D U x = b
/*
  MatSolve_SeqAIJCUSPARSE_Cholesky - solve Ut D U x = b on the GPU with cuSPARSE SpSV.

  Steps: (optionally) permute b into the work vector fs->X, solve Ut Y = X, scale Y entrywise by
  fs->diag, solve U X = Y, and (optionally) permute the result into x.

  Input Parameters:
+ A - the factored matrix; A->spptr is a Mat_SeqAIJCUSPARSETriFactors
- b - the right-hand side

  Output Parameter:
. x - the solution
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij   = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                nrows = A->rmap->n;
  const cusparseSpSVAlg_t       alg   = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));

  thrust::device_ptr<PetscScalar>       sol_d  = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhs_d  = thrust::device_pointer_cast(rhs);
  auto                                  work_x = thrust::device_pointer_cast(fs->X);
  auto                                  work_y = thrust::device_pointer_cast(fs->Y);

  // Input of the first solve: b itself, or b gathered through the row permutation into fs->X
  void *spsv_in = (void *)rhs;
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhs_d, fs->rpermIndices->begin()), thrust::make_permutation_iterator(rhs_d, fs->rpermIndices->end()), work_x));
    spsv_in = fs->X;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, spsv_in));

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y, done as Y = Y * D because D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // This is a plain entrywise product, for which cublas offers no routine.
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), work_y, work_y + nrows, thrust::device_pointer_cast(fs->diag), work_y, thrust::multiplies<PetscScalar>()));

  // Solve U X = Y; the output lands in fs->X when a column permutation still has to be applied, else directly in x
  void *spsv_out = fs->cpermIndices ? (void *)fs->X : (void *)sol;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, spsv_out));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Gather fs->X through the column permutation into x when needed
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work_x, fs->cpermIndices->begin()), thrust::make_permutation_iterator(work_x + nrows, fs->cpermIndices->end()), sol_d));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772 #else
/*
  Expand the host ICC factor, stored row by row with the diagonal as the last entry of each row,
  into the value arrays of the two cuSPARSE triangular factors.

  For every row the diagonal slot comes first and receives 1/d in both arrays; the off-diagonals
  follow, negated in AAUp and negated then divided by d in AALo.

  AiUp and AjUp are optional (may be NULL): when given, the row offsets for rows 0..n-1 and the
  column indices of the re-arranged pattern are filled as well. AiUp[n] is left to the caller.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFillICCFactors_Private(PetscInt n, const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt *AiUp, PetscInt *AjUp, PetscScalar *AAUp, PetscScalar *AALo)
{
  PetscInt offset = 0;

  PetscFunctionBegin;
  for (PetscInt i = 0; i < n; i++) {
    const MatScalar *v  = aa + ai[i];
    const PetscInt   nz = ai[i + 1] - ai[i] - 1; /* number of off-diagonals; v[nz] is the diagonal */

    /* first, set the diagonal elements */
    if (AiUp) AiUp[i] = offset;
    if (AjUp) AjUp[offset] = i;
    AAUp[offset] = (MatScalar)1.0 / v[nz];
    AALo[offset] = (MatScalar)1.0 / v[nz];
    offset += 1;

    if (nz > 0) {
      if (AjUp) PetscCall(PetscArraycpy(&(AjUp[offset]), aj + ai[i], nz));
      PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
      for (PetscInt j = offset; j < offset + nz; j++) {
        AAUp[j] = -AAUp[j];
        AALo[j] = AAUp[j] / v[nz];
      }
      offset += nz;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Create one triangular factor structure: matrix descriptor (zero based, upper fill mode, the given
  diagonal type), device CSR copy of (Ai, Aj, Aa), and the csrsv solve analysis for the given operation.

  The new structure is returned in *factor; storing it into the Mat is left to the caller.
*/
static PetscErrorCode MatSeqAIJCUSPARSECreateICCTriFactor_Private(Mat A, cusparseDiagType_t diagType, cusparseOperation_t solveOp, const PetscInt *Ai, const PetscInt *Aj, const PetscScalar *Aa, Mat_SeqAIJCUSPARSETriFactorStruct **factor)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *tri;

  PetscFunctionBegin;
  /* allocate space for the triangular factor information */
  PetscCall(PetscNew(&tri));
  tri->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&tri->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(tri->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseSetMatType(tri->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
  PetscCallCUSPARSE(cusparseSetMatType(tri->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
  PetscCallCUSPARSE(cusparseSetMatFillMode(tri->descr, CUSPARSE_FILL_MODE_UPPER));
  PetscCallCUSPARSE(cusparseSetMatDiagType(tri->descr, diagType));

  /* set the operation */
  tri->solveOp = solveOp;

  /* set the matrix */
  tri->csrMat              = new CsrMatrix;
  tri->csrMat->num_rows    = A->rmap->n;
  tri->csrMat->num_cols    = A->cmap->n;
  tri->csrMat->num_entries = a->nz;

  tri->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
  tri->csrMat->row_offsets->assign(Ai, Ai + A->rmap->n + 1);

  tri->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
  tri->csrMat->column_indices->assign(Aj, Aj + a->nz);

  tri->csrMat->values = new THRUSTARRAY(a->nz);
  tri->csrMat->values->assign(Aa, Aa + a->nz);

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&tri->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, tri->solveOp, tri->csrMat->num_rows, tri->csrMat->num_entries, tri->descr, tri->csrMat->values->data().get(), tri->csrMat->row_offsets->data().get(), tri->csrMat->column_indices->data().get(), tri->solveInfo, &tri->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&tri->solveBuffer, tri->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, tri->solveOp, tri->csrMat->num_rows, tri->csrMat->num_entries, tri->descr, tri->csrMat->values->data().get(), tri->csrMat->row_offsets->data().get(), tri->csrMat->column_indices->data().get(), tri->solveInfo, tri->solvePolicy, tri->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  *factor = tri;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - build or refresh the two cuSPARSE triangular factors of an
  ICC/Cholesky factored matrix from the host factor.

  Nothing is done for an empty matrix, or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED
  nor PETSC_OFFLOAD_CPU. When neither factor exists yet, both are created (pattern, values,
  descriptors, solve analysis); otherwise only the values of the existing factors are re-uploaded.

  Both factors share one CSR pattern stored with upper fill mode: the "upper" factor has a unit
  diagonal type and is solved without transposition, the "lower" one has a non-unit diagonal type
  and is solved with CUSPARSE_OPERATION_TRANSPOSE.

  Input Parameter:
. A - the factored matrix; A->spptr must be a valid Mat_SeqAIJCUSPARSETriFactors

  Errors with PETSC_ERR_COR when exactly one of the two factors exists.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n;
  /* NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ; presumably the i, j, a members line up - confirm */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j;
  const MatScalar                   *aa = b->a;
  const bool                         firstTime = !upTriFactor && !loTriFactor;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (!firstTime) {
      /* validate before any pinned allocation so this error path does not leak the staging buffers */
      PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
    }
    try {
      /* pinned host staging buffers for the values of the two factors */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (firstTime) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the pattern and the values of both factors */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        PetscCall(MatSeqAIJCUSPARSEFillICCFactors_Private(n, ai, aj, aa, AiUp, AjUp, AAUp, AALo));

        /* upper factor: unit diagonal type, solved as is */
        PetscCall(MatSeqAIJCUSPARSECreateICCTriFactor_Private(A, CUSPARSE_DIAG_TYPE_UNIT, CUSPARSE_OPERATION_NON_TRANSPOSE, AiUp, AjUp, AAUp, &upTriFactor));
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* lower factor: same pattern, non-unit diagonal type, solved through the transpose operation */
        PetscCall(MatSeqAIJCUSPARSECreateICCTriFactor_Private(A, CUSPARSE_DIAG_TYPE_NON_UNIT, CUSPARSE_OPERATION_TRANSPOSE, AiUp, AjUp, AALo, &loTriFactor));
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* pattern and analysis already exist: refresh the values only */
        PetscCall(MatSeqAIJCUSPARSEFillICCFactors_Private(n, ai, NULL, aa, NULL, NULL, AAUp, AALo));
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969 #endif
970 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - push the host ICC/Cholesky factor of A to the GPU.

  Builds (or refreshes) the device-side factor(s) through the CUDA-version specific builder, records
  the nonzero count, marks A as valid on both host and device and, for a non-identity ordering,
  uploads the permutation (a->row) and its inverse as rpermIndices and cpermIndices.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Errors with PETSC_ERR_COR when A->spptr is NULL.

  Note:
  This routine runs on every numeric factorization, so the device permutation arrays are allocated
  only once and afterwards just refilled; allocating them anew each time would leak the old arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* off-diagonal entries counted twice, diagonal entries once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices, needed only for a non-identity ordering */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    /* reuse the device arrays from a previous factorization instead of leaking them */
    if (!cusparseTriFactors->rpermIndices) cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    if (!cusparseTriFactors->cpermIndices) cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  The factorization itself is delegated to MatCholeskyFactorNumeric_SeqAIJ(); afterwards the solve
  callbacks of B are selected and the factor is sent to the GPU.

  Input Parameters:
+ A    - the matrix to factor
- info - factorization options, forwarded unchanged

  Output Parameter:
. B - the factored matrix
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  /* the fresh factor exists on the host only at this point */
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* one routine serves both the plain and the transposed solve */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* pick the MatSolve flavor: the natural-ordering variants when the ordering is the identity */
    PetscBool natural;

    PetscCall(ISIdentity(((Mat_SeqAIJ *)B->data)->row, &natural));
    B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1045 
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054   cusparseIndexBase_t                indexBase;
1055   cusparseMatrixType_t               matrixType;
1056   cusparseFillMode_t                 fillMode;
1057   cusparseDiagType_t                 diagType;
1058 
1059   PetscFunctionBegin;
1060   /* allocate space for the transpose of the lower triangular factor */
1061   PetscCall(PetscNew(&loTriFactorT));
1062   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063 
1064   /* set the matrix descriptors of the lower triangular factor */
1065   matrixType = cusparseGetMatType(loTriFactor->descr);
1066   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069 
1070   /* Create the matrix description */
1071   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076 
1077   /* set the operation */
1078   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079 
1080   /* allocate GPU space for the CSC of the lower triangular factor*/
1081   loTriFactorT->csrMat                 = new CsrMatrix;
1082   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088 
1089   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095   #endif
1096 
1097   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098   {
1099     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104   #else
1105                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106   #endif
1107     PetscCallCUSPARSE(stat);
1108   }
1109 
1110   PetscCallCUDA(WaitForCUDA());
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112 
1113   /* Create the solve analysis information */
1114   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120   #endif
1121 
1122   /* perform the solve analysis */
1123   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125 
1126   PetscCallCUDA(WaitForCUDA());
1127   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128 
1129   /* assign the pointer */
1130   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131 
1132   /*********************************************/
1133   /* Now the Transpose of the Upper Tri Factor */
1134   /*********************************************/
1135 
1136   /* allocate space for the transpose of the upper triangular factor */
1137   PetscCall(PetscNew(&upTriFactorT));
1138   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139 
1140   /* set the matrix descriptors of the upper triangular factor */
1141   matrixType = cusparseGetMatType(upTriFactor->descr);
1142   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145 
1146   /* Create the matrix description */
1147   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152 
1153   /* set the operation */
1154   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155 
1156   /* allocate GPU space for the CSC of the upper triangular factor*/
1157   upTriFactorT->csrMat                 = new CsrMatrix;
1158   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164 
1165   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171   #endif
1172 
1173   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   {
1175     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180   #else
1181                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182   #endif
1183     PetscCallCUSPARSE(stat);
1184   }
1185 
1186   PetscCallCUDA(WaitForCUDA());
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188 
1189   /* Create the solve analysis information */
1190   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196   #endif
1197 
1198   /* perform the solve analysis */
1199   /* christ, would it have killed you to put this stuff in a function????????? */
1200   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202 
1203   PetscCallCUDA(WaitForCUDA());
1204   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205 
1206   /* assign the pointer */
1207   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211 
/* Unary functor for thrust::transform(): maps a PetscScalar to a PetscInt by taking the real part of the
   scalar and converting it to an integer (the fractional part, if any, is discarded by the conversion). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    const PetscInt asInt = static_cast<PetscInt>(PetscRealPart(s));
    return asInt;
  }
};
1215 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Builds, or refreshes the values of, the explicit transpose of A that is
  stored in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; A->spptr is used as a Mat_SeqAIJCUSPARSE and A->data as a Mat_SeqAIJ

  Notes:
  The matrix is first pushed to the GPU with MatSeqAIJCUSPARSECopyToGPU(). When A->transupdated is already set the
  routine returns immediately; on successful completion it sets A->transupdated to PETSC_TRUE.

  MAT_CUSPARSE_CSR: the transposed structure is allocated once. The first time through, csr2csc is run on the
  sequence 0,1,2,... instead of on the matrix values, so that the "values" of the result are the positions in A of
  each transposed entry; these are stored as integers in cusparsestruct->csr2csc_i. Every call then fills the values
  of the transpose with a permuted copy of the values of A.

  MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB: the transpose is invalidated and rebuilt on each call through the chain
  HYB -> CSR -> CSC -> HYB. This is only available before CUDA-11.0; later versions raise PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* the stored transpose is current, nothing to do */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* only the CSR format is updated in place below; any other format is thrown away and rebuilt */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of A^T: A->cmap->n rows, A->rmap->n columns, same number of nonzeros as A */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the host row offsets of A; they are the row pointer input of csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      /* account for the host-to-device transfer, as is done for the same upload in the update path below */
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         Read as CSR, the CSC of A is A^T: it has A->cmap->n rows and A->rmap->n columns, and csr2csc writes
         A->cmap->n + 1 column pointers into tempT->row_offsets, so that array must be sized from cmap, not rmap */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transpose */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      delete (THRUSTARRAY *)tempT->values;
      delete (THRUSTINTARRAY32 *)tempT->column_indices;
      delete (THRUSTINTARRAY32 *)tempT->row_offsets;
      delete (CsrMatrix *)tempT;
      delete (THRUSTARRAY *)temp->values;
      delete (THRUSTINTARRAY32 *)temp->column_indices;
      delete (THRUSTINTARRAY32 *)temp->row_offsets;
      delete (CsrMatrix *)temp;
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* The permutation is not known yet: transpose the sequence 0,1,2,... in place of the values of A, so that
         the transposed "values" are the positions in A each entry of the transpose comes from */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the transposed sequence from scalars to integers and keep it as the csr -> csc permutation */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* fill the values of the transpose: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408 
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - Solves A x = b on the GPU with two cusparseSpSV_solve() calls, one with the L factor
  and one with the U factor stored in the Mat_SeqAIJCUSPARSETriFactors at A->spptr.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  When fs->rpermIndices is set, b is first gathered through it into the work array fs->X. When fs->cpermIndices is
  set, the U solve writes into fs->X and the result is gathered through cpermIndices into x; otherwise the U solve
  writes straight into x. fs->Y always holds the intermediate vector between the two solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij   = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                nrows = A->rmap->n;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  // Input of the L solve: b gathered through the row permutation into fs->X, or b itself when there is none
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? (void *)fs->X : (void *)rhs));

  // L Y = X, with Y living in fs->Y
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // cusparseSpSV_solve() silently relies on the external buffer that was handed to cusparseSpSV_analysis()
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  // U X = Y; the output goes to fs->X when it still has to be permuted, to x otherwise
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? (void *)fs->X : (void *)sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  // Gather fs->X through the column permutation into x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), solGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Solves with the transposes of the U and L factors stored in the
  Mat_SeqAIJCUSPARSETriFactors at A->spptr, by calling cusparseSpSV_solve() with CUSPARSE_OPERATION_TRANSPOSE on the
  same spMatDescr_U and spMatDescr_L descriptors (U first, then L).

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The transpose SpSV descriptors and their buffers are created the first time this routine runs
  (fs->createdTransposeSpSVDescr), and the transpose analysis is redone whenever fs->updatedTransposeSpSVAnalysis
  is not set.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     transOp = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  // One-time setup on the first transpose solve: descriptors, buffer sizes and buffers for both factors
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    // The matrix descriptor is still the one of L; only the operation is a transpose
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // Redo the transpose analysis of both factors when it is flagged as out of date
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  // Input of the first solve: b gathered through the row permutation into fs->X, or b itself when there is none
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? (void *)fs->X : (void *)rhs));

  // Ut Y = X, with Y living in fs->Y
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut));

  // Lt X = Y; the output goes to the intermediate fs->X when it still has to be permuted, to x otherwise
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? (void *)fs->X : (void *)sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt));

  // Gather fs->X through the column permutation into x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), solGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1532 #else
1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve for builds before CUDA-11.4, using the explicitly transposed
  triangular factors and cusparseXcsrsv_solve().

  Input Parameters:
+ A  - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called to build them.
  The sequence is: gather bb through rpermIndices into xx, solve with the transposed upper factor into the work
  vector, solve with the transposed lower factor back into xx, then gather xx through cpermIndices (via the work
  vector, since the gather cannot be done in place).

  Thrust failures are converted to PETSc errors with PetscCallThrust().
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve with the transposed upper factor: input xarray, output the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with the transposed lower factor: input the work vector, output xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
1586 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587 {
1588   const PetscScalar                 *barray;
1589   PetscScalar                       *xarray;
1590   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1594 
1595   PetscFunctionBegin;
1596   /* Analyze the matrix and create the transpose ... on the fly */
1597   if (!loTriFactorT && !upTriFactorT) {
1598     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601   }
1602 
1603   /* Get the GPU pointers */
1604   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1605   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1606 
1607   PetscCall(PetscLogGpuTimeBegin());
1608   /* First, solve U */
1609   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1610                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1611 
1612   /* Then, solve L */
1613   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1614                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1615 
1616   /* restore */
1617   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1618   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1619   PetscCall(PetscLogGpuTimeEnd());
1620   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625 {
1626   const PetscScalar                    *barray;
1627   PetscScalar                          *xarray;
1628   thrust::device_ptr<const PetscScalar> bGPU;
1629   thrust::device_ptr<PetscScalar>       xGPU;
1630   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1634 
1635   PetscFunctionBegin;
1636   /* Get the GPU pointers */
1637   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1638   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639   xGPU = thrust::device_pointer_cast(xarray);
1640   bGPU = thrust::device_pointer_cast(barray);
1641 
1642   PetscCall(PetscLogGpuTimeBegin());
1643   /* First, reorder with the row permutation */
1644   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1645 
1646   /* Next, solve L */
1647   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1648                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1649 
1650   /* Then, solve U */
1651   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1652                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1653 
1654   /* Last, reorder with the column permutation */
1655   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1656 
1657   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1658   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1659   PetscCall(PetscLogGpuTimeEnd());
1660   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1661   PetscFunctionReturn(PETSC_SUCCESS);
1662 }
1663 
1664 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665 {
1666   const PetscScalar                 *barray;
1667   PetscScalar                       *xarray;
1668   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1672 
1673   PetscFunctionBegin;
1674   /* Get the GPU pointers */
1675   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1676   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1677 
1678   PetscCall(PetscLogGpuTimeBegin());
1679   /* First, solve L */
1680   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1681                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1682 
1683   /* Next, solve U */
1684   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1685                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1686 
1687   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1688   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1689   PetscCall(PetscLogGpuTimeEnd());
1690   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1691   PetscFunctionReturn(PETSC_SUCCESS);
1692 }
1693 #endif
1694 
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1696 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697 {
1698   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1700   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701   CsrMatrix                    *Acsr;
1702   PetscInt                      m, nz;
1703   PetscBool                     flg;
1704 
1705   PetscFunctionBegin;
1706   if (PetscDefined(USE_DEBUG)) {
1707     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709   }
1710 
1711   /* Copy A's value to fact */
1712   m  = fact->rmap->n;
1713   nz = aij->nz;
1714   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715   Acsr = (CsrMatrix *)Acusp->mat->mat;
1716   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1717 
1718   /* Factorize fact inplace */
1719   if (m)
1720     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1721                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1722   if (PetscDefined(USE_DEBUG)) {
1723     int              numerical_zero;
1724     cusparseStatus_t status;
1725     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1726     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1727   }
1728 
1729   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1730      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1731   */
1732   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1733 
1734   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1735 
1736   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1737   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1738 
1739   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1740   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1741   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1742   fact->ops->matsolve          = NULL;
1743   fact->ops->matsolvetranspose = NULL;
1744   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1745   PetscFunctionReturn(PETSC_SUCCESS);
1746 }
1747 
1748 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1749 {
1750   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1751   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1752   PetscInt                      m, nz;
1753 
1754   PetscFunctionBegin;
1755   if (PetscDefined(USE_DEBUG)) {
1756     PetscInt  i;
1757     PetscBool flg, missing;
1758 
1759     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1760     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1761     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1762     PetscCall(MatMissingDiagonal(A, &missing, &i));
1763     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1764   }
1765 
1766   /* Free the old stale stuff */
1767   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1768 
1769   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1770      but they will not be used. Allocate them just for easy debugging.
1771    */
1772   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1773 
1774   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1775   fact->factortype             = MAT_FACTOR_ILU;
1776   fact->info.factor_mallocs    = 0;
1777   fact->info.fill_ratio_given  = info->fill;
1778   fact->info.fill_ratio_needed = 1.0;
1779 
1780   aij->row = NULL;
1781   aij->col = NULL;
1782 
1783   /* ====================================================================== */
1784   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1785   /* We'll do in-place factorization on fact                                */
1786   /* ====================================================================== */
1787   const int *Ai, *Aj;
1788 
1789   m  = fact->rmap->n;
1790   nz = aij->nz;
1791 
1792   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
1793   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
1794   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
1795   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1796   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1797   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1798 
1799   /* ====================================================================== */
1800   /* Create descriptors for M, L, U                                         */
1801   /* ====================================================================== */
1802   cusparseFillMode_t fillMode;
1803   cusparseDiagType_t diagType;
1804 
1805   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1806   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1807   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1808 
1809   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1810     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1811     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1812     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1813     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1814   */
1815   fillMode = CUSPARSE_FILL_MODE_LOWER;
1816   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1817   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1818   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1819   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1820 
1821   fillMode = CUSPARSE_FILL_MODE_UPPER;
1822   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1823   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1824   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1825   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1826 
1827   /* ========================================================================= */
1828   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1829   /* ========================================================================= */
1830   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1831   if (m)
1832     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1833                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1834 
1835   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1836   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1837 
1838   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1839   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1840 
1841   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1842   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1843 
1844   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1845   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1846 
1847   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1848      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1849      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1850      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1851    */
1852   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1853     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1854     fs->spsvBuffer_L = fs->factBuffer_M;
1855     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1856   } else {
1857     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1858     fs->spsvBuffer_U = fs->factBuffer_M;
1859     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1860   }
1861 
1862   /* ========================================================================== */
1863   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1864   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1865   /* ========================================================================== */
1866   int              structural_zero;
1867   cusparseStatus_t status;
1868 
1869   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1870   if (m)
1871     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1872                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1873   if (PetscDefined(USE_DEBUG)) {
1874     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1875     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1876     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1877   }
1878 
1879   /* Estimate FLOPs of the numeric factorization */
1880   {
1881     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1882     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1883     PetscLogDouble flops = 0.0;
1884 
1885     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1886     Ai    = Aseq->i;
1887     Adiag = Aseq->diag;
1888     for (PetscInt i = 0; i < m; i++) {
1889       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1890         nzRow  = Ai[i + 1] - Ai[i];
1891         nzLeft = Adiag[i] - Ai[i];
1892         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1893           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1894         */
1895         nzLeft = (nzRow - 1) / 2;
1896         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1897       }
1898     }
1899     fs->numericFactFlops = flops;
1900   }
1901   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1902   PetscFunctionReturn(PETSC_SUCCESS);
1903 }
1904 
1905 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1906 {
1907   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1908   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1909   const PetscScalar            *barray;
1910   PetscScalar                  *xarray;
1911 
1912   PetscFunctionBegin;
1913   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1914   PetscCall(VecCUDAGetArrayRead(b, &barray));
1915   PetscCall(PetscLogGpuTimeBegin());
1916 
1917   /* Solve L*y = b */
1918   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1919   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1920   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1921                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1922 
1923   /* Solve Lt*x = y */
1924   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1925   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1926                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1927 
1928   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1929   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1930 
1931   PetscCall(PetscLogGpuTimeEnd());
1932   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1933   PetscFunctionReturn(PETSC_SUCCESS);
1934 }
1935 
1936 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1937 {
1938   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1939   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1940   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1941   CsrMatrix                    *Acsr;
1942   PetscInt                      m, nz;
1943   PetscBool                     flg;
1944 
1945   PetscFunctionBegin;
1946   if (PetscDefined(USE_DEBUG)) {
1947     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1948     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1949   }
1950 
1951   /* Copy A's value to fact */
1952   m  = fact->rmap->n;
1953   nz = aij->nz;
1954   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1955   Acsr = (CsrMatrix *)Acusp->mat->mat;
1956   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1957 
1958   /* Factorize fact inplace */
1959   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1960      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1961      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1962      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1963      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1964    */
1965   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1966   if (PetscDefined(USE_DEBUG)) {
1967     int              numerical_zero;
1968     cusparseStatus_t status;
1969     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1970     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1971   }
1972 
1973   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1974 
1975   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1976     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1977   */
1978   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1979 
1980   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1981   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1982   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1983   fact->ops->matsolve          = NULL;
1984   fact->ops->matsolvetranspose = NULL;
1985   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1986   PetscFunctionReturn(PETSC_SUCCESS);
1987 }
1988 
1989 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1990 {
1991   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1992   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1993   PetscInt                      m, nz;
1994 
1995   PetscFunctionBegin;
1996   if (PetscDefined(USE_DEBUG)) {
1997     PetscInt  i;
1998     PetscBool flg, missing;
1999 
2000     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2001     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2002     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2003     PetscCall(MatMissingDiagonal(A, &missing, &i));
2004     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2005   }
2006 
2007   /* Free the old stale stuff */
2008   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2009 
2010   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2011      but they will not be used. Allocate them just for easy debugging.
2012    */
2013   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2014 
2015   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2016   fact->factortype             = MAT_FACTOR_ICC;
2017   fact->info.factor_mallocs    = 0;
2018   fact->info.fill_ratio_given  = info->fill;
2019   fact->info.fill_ratio_needed = 1.0;
2020 
2021   aij->row = NULL;
2022   aij->col = NULL;
2023 
2024   /* ====================================================================== */
2025   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2026   /* We'll do in-place factorization on fact                                */
2027   /* ====================================================================== */
2028   const int *Ai, *Aj;
2029 
2030   m  = fact->rmap->n;
2031   nz = aij->nz;
2032 
2033   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
2034   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
2035   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2036   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2037   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2038   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2039 
2040   /* ====================================================================== */
2041   /* Create mat descriptors for M, L                                        */
2042   /* ====================================================================== */
2043   cusparseFillMode_t fillMode;
2044   cusparseDiagType_t diagType;
2045 
2046   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2047   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2048   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2049 
2050   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2051     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2052     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2053     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2054     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2055   */
2056   fillMode = CUSPARSE_FILL_MODE_LOWER;
2057   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2058   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2059   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2060   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2061 
2062   /* ========================================================================= */
2063   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2064   /* ========================================================================= */
2065   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2066   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2067 
2068   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2069   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2070 
2071   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2072   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2073 
2074   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2075   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2079 
2080   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2081      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2082    */
2083   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2084     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2085     fs->spsvBuffer_L = fs->factBuffer_M;
2086     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2087   } else {
2088     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2089     fs->spsvBuffer_Lt = fs->factBuffer_M;
2090     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2091   }
2092 
2093   /* ========================================================================== */
2094   /* Perform analysis of ic0 on M                                               */
2095   /* The lower triangular part of M has the same sparsity pattern as L          */
2096   /* ========================================================================== */
2097   int              structural_zero;
2098   cusparseStatus_t status;
2099 
2100   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2101   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2102   if (PetscDefined(USE_DEBUG)) {
2103     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2104     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2105     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2106   }
2107 
2108   /* Estimate FLOPs of the numeric factorization */
2109   {
2110     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2111     PetscInt      *Ai, nzRow, nzLeft;
2112     PetscLogDouble flops = 0.0;
2113 
2114     Ai = Aseq->i;
2115     for (PetscInt i = 0; i < m; i++) {
2116       nzRow = Ai[i + 1] - Ai[i];
2117       if (nzRow > 1) {
2118         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2119           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2120         */
2121         nzLeft = (nzRow - 1) / 2;
2122         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2123       }
2124     }
2125     fs->numericFactFlops = flops;
2126   }
2127   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 #endif
2131 
2132 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2133 {
2134   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2135   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2136 
2137   PetscFunctionBegin;
2138   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2139   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2140   B->offloadmask = PETSC_OFFLOAD_CPU;
2141 
2142   if (!cusparsestruct->use_cpu_solve) {
2143 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2144     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2145     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2146 #else
2147     /* determine which version of MatSolve needs to be used. */
2148     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2149     IS          isrow = b->row, iscol = b->col;
2150     PetscBool   row_identity, col_identity;
2151 
2152     PetscCall(ISIdentity(isrow, &row_identity));
2153     PetscCall(ISIdentity(iscol, &col_identity));
2154     if (row_identity && col_identity) {
2155       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2156       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2157     } else {
2158       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2159       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2160     }
2161 #endif
2162   }
2163   B->ops->matsolve          = NULL;
2164   B->ops->matsolvetranspose = NULL;
2165 
2166   /* get the triangular factors */
2167   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2168   PetscFunctionReturn(PETSC_SUCCESS);
2169 }
2170 
2171 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2172 {
2173   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2174 
2175   PetscFunctionBegin;
2176   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2177   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2178   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2179   PetscFunctionReturn(PETSC_SUCCESS);
2180 }
2181 
2182 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2183 {
2184   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2185 
2186   PetscFunctionBegin;
2187 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2188   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2189   if (cusparseTriFactors->factorizeOnDevice) {
2190     PetscCall(ISIdentity(isrow, &row_identity));
2191     PetscCall(ISIdentity(iscol, &col_identity));
2192   }
2193   if (!info->levels && row_identity && col_identity) {
2194     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2195   } else
2196 #endif
2197   {
2198     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2199     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2200     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2201   }
2202   PetscFunctionReturn(PETSC_SUCCESS);
2203 }
2204 
2205 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2206 {
2207   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2208 
2209   PetscFunctionBegin;
2210 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2211   PetscBool perm_identity = PETSC_FALSE;
2212   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
2213   if (!info->levels && perm_identity) {
2214     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2215   } else
2216 #endif
2217   {
2218     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2219     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2220     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2221   }
2222   PetscFunctionReturn(PETSC_SUCCESS);
2223 }
2224 
2225 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2226 {
2227   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2228 
2229   PetscFunctionBegin;
2230   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2231   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2232   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2233   PetscFunctionReturn(PETSC_SUCCESS);
2234 }
2235 
2236 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2237 {
2238   PetscFunctionBegin;
2239   *type = MATSOLVERCUSPARSE;
2240   PetscFunctionReturn(PETSC_SUCCESS);
2241 }
2242 
2243 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2250 
2251   Level: beginner
2252 
2253 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2254           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2255 M*/
2256 
2257 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2258 {
2259   PetscInt  n = A->rmap->n;
2260   PetscBool factOnDevice, factOnHost;
2261   char     *prefix;
2262   char      factPlace[32] = "device"; /* the default */
2263 
2264   PetscFunctionBegin;
2265   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2266   PetscCall(MatSetSizes(*B, n, n, n, n));
2267   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2268   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2269 
2270   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
2271   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
2272   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
2273   PetscOptionsEnd();
2274   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
2275   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
2276   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
2277   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
2278 
2279   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2280   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2281     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2282     if (!A->boundtocpu) {
2283       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2284       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2285     } else {
2286       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2287       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2288     }
2289     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2290     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2291     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2292   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2293     if (!A->boundtocpu) {
2294       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2295       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2296     } else {
2297       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2298       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2299     }
2300     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2301     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2302   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2303 
2304   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2305   (*B)->canuseordering = PETSC_TRUE;
2306   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2307   PetscFunctionReturn(PETSC_SUCCESS);
2308 }
2309 
2310 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2311 {
2312   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2313   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2314 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2315   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2316 #endif
2317 
2318   PetscFunctionBegin;
2319   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2320     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2321     if (A->factortype == MAT_FACTOR_NONE) {
2322       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2323       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2324     }
2325 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2326     else if (fs->csrVal) {
2327       /* We have a factorized matrix on device and are able to copy it to host */
2328       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2329     }
2330 #endif
2331     else
2332       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2333     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2334     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2335     A->offloadmask = PETSC_OFFLOAD_BOTH;
2336   }
2337   PetscFunctionReturn(PETSC_SUCCESS);
2338 }
2339 
2340 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2341 {
2342   PetscFunctionBegin;
2343   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2344   *array = ((Mat_SeqAIJ *)A->data)->a;
2345   PetscFunctionReturn(PETSC_SUCCESS);
2346 }
2347 
2348 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2349 {
2350   PetscFunctionBegin;
2351   A->offloadmask = PETSC_OFFLOAD_CPU;
2352   *array         = NULL;
2353   PetscFunctionReturn(PETSC_SUCCESS);
2354 }
2355 
2356 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2357 {
2358   PetscFunctionBegin;
2359   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2360   *array = ((Mat_SeqAIJ *)A->data)->a;
2361   PetscFunctionReturn(PETSC_SUCCESS);
2362 }
2363 
2364 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2365 {
2366   PetscFunctionBegin;
2367   *array = NULL;
2368   PetscFunctionReturn(PETSC_SUCCESS);
2369 }
2370 
2371 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2372 {
2373   PetscFunctionBegin;
2374   *array = ((Mat_SeqAIJ *)A->data)->a;
2375   PetscFunctionReturn(PETSC_SUCCESS);
2376 }
2377 
2378 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2379 {
2380   PetscFunctionBegin;
2381   A->offloadmask = PETSC_OFFLOAD_CPU;
2382   *array         = NULL;
2383   PetscFunctionReturn(PETSC_SUCCESS);
2384 }
2385 
2386 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2387 {
2388   Mat_SeqAIJCUSPARSE *cusp;
2389   CsrMatrix          *matrix;
2390 
2391   PetscFunctionBegin;
2392   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2393   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2394   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2395   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2396   matrix = (CsrMatrix *)cusp->mat->mat;
2397 
2398   if (i) {
2399 #if !defined(PETSC_USE_64BIT_INDICES)
2400     *i = matrix->row_offsets->data().get();
2401 #else
2402     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2403 #endif
2404   }
2405   if (j) {
2406 #if !defined(PETSC_USE_64BIT_INDICES)
2407     *j = matrix->column_indices->data().get();
2408 #else
2409     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2410 #endif
2411   }
2412   if (a) *a = matrix->values->data().get();
2413   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2414   PetscFunctionReturn(PETSC_SUCCESS);
2415 }
2416 
2417 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2418 {
2419   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2420   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2421   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2422   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2423   cusparseStatus_t              stat;
2424   PetscBool                     both = PETSC_TRUE;
2425 
2426   PetscFunctionBegin;
2427   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2428   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2429     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2430       CsrMatrix *matrix;
2431       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2432 
2433       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2434       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2435       matrix->values->assign(a->a, a->a + a->nz);
2436       PetscCallCUDA(WaitForCUDA());
2437       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2438       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2439       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2440     } else {
2441       PetscInt nnz;
2442       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2443       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2444       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2445       delete cusparsestruct->workVector;
2446       delete cusparsestruct->rowoffsets_gpu;
2447       cusparsestruct->workVector     = NULL;
2448       cusparsestruct->rowoffsets_gpu = NULL;
2449       try {
2450         if (a->compressedrow.use) {
2451           m    = a->compressedrow.nrows;
2452           ii   = a->compressedrow.i;
2453           ridx = a->compressedrow.rindex;
2454         } else {
2455           m    = A->rmap->n;
2456           ii   = a->i;
2457           ridx = NULL;
2458         }
2459         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2460         if (!a->a) {
2461           nnz  = ii[m];
2462           both = PETSC_FALSE;
2463         } else nnz = a->nz;
2464         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2465 
2466         /* create cusparse matrix */
2467         cusparsestruct->nrows = m;
2468         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2469         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2470         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2471         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2472 
2473         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2474         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2475         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2476         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2477         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2478         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2480 
2481         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2482         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2483           /* set the matrix */
2484           CsrMatrix *mat   = new CsrMatrix;
2485           mat->num_rows    = m;
2486           mat->num_cols    = A->cmap->n;
2487           mat->num_entries = nnz;
2488           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2489           mat->row_offsets->assign(ii, ii + m + 1);
2490 
2491           mat->column_indices = new THRUSTINTARRAY32(nnz);
2492           mat->column_indices->assign(a->j, a->j + nnz);
2493 
2494           mat->values = new THRUSTARRAY(nnz);
2495           if (a->a) mat->values->assign(a->a, a->a + nnz);
2496 
2497           /* assign the pointer */
2498           matstruct->mat = mat;
2499 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2500           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2501             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2502                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2503             PetscCallCUSPARSE(stat);
2504           }
2505 #endif
2506         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2507 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2508           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2509 #else
2510           CsrMatrix *mat   = new CsrMatrix;
2511           mat->num_rows    = m;
2512           mat->num_cols    = A->cmap->n;
2513           mat->num_entries = nnz;
2514           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2515           mat->row_offsets->assign(ii, ii + m + 1);
2516 
2517           mat->column_indices = new THRUSTINTARRAY32(nnz);
2518           mat->column_indices->assign(a->j, a->j + nnz);
2519 
2520           mat->values = new THRUSTARRAY(nnz);
2521           if (a->a) mat->values->assign(a->a, a->a + nnz);
2522 
2523           cusparseHybMat_t hybMat;
2524           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2525           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2526           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2527           PetscCallCUSPARSE(stat);
2528           /* assign the pointer */
2529           matstruct->mat = hybMat;
2530 
2531           if (mat) {
2532             if (mat->values) delete (THRUSTARRAY *)mat->values;
2533             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2534             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2535             delete (CsrMatrix *)mat;
2536           }
2537 #endif
2538         }
2539 
2540         /* assign the compressed row indices */
2541         if (a->compressedrow.use) {
2542           cusparsestruct->workVector = new THRUSTARRAY(m);
2543           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2544           matstruct->cprowIndices->assign(ridx, ridx + m);
2545           tmp = m;
2546         } else {
2547           cusparsestruct->workVector = NULL;
2548           matstruct->cprowIndices    = NULL;
2549           tmp                        = 0;
2550         }
2551         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2552 
2553         /* assign the pointer */
2554         cusparsestruct->mat = matstruct;
2555       } catch (char *ex) {
2556         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2557       }
2558       PetscCallCUDA(WaitForCUDA());
2559       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2560       cusparsestruct->nonzerostate = A->nonzerostate;
2561     }
2562     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2563   }
2564   PetscFunctionReturn(PETSC_SUCCESS);
2565 }
2566 
2567 struct VecCUDAPlusEquals {
2568   template <typename Tuple>
2569   __host__ __device__ void operator()(Tuple t)
2570   {
2571     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2572   }
2573 };
2574 
2575 struct VecCUDAEquals {
2576   template <typename Tuple>
2577   __host__ __device__ void operator()(Tuple t)
2578   {
2579     thrust::get<1>(t) = thrust::get<0>(t);
2580   }
2581 };
2582 
2583 struct VecCUDAEqualsReverse {
2584   template <typename Tuple>
2585   __host__ __device__ void operator()(Tuple t)
2586   {
2587     thrust::get<0>(t) = thrust::get<1>(t);
2588   }
2589 };
2590 
2591 struct MatMatCusparse {
2592   PetscBool      cisdense;
2593   PetscScalar   *Bt;
2594   Mat            X;
2595   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2596   PetscLogDouble flops;
2597   CsrMatrix     *Bcsr;
2598 
2599 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2600   cusparseSpMatDescr_t matSpBDescr;
2601   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2602   cusparseDnMatDescr_t matBDescr;
2603   cusparseDnMatDescr_t matCDescr;
2604   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2605   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2606   void *dBuffer4;
2607   void *dBuffer5;
2608   #endif
2609   size_t                mmBufferSize;
2610   void                 *mmBuffer;
2611   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2612   cusparseSpGEMMDescr_t spgemmDesc;
2613 #endif
2614 };
2615 
2616 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2617 {
2618   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2619 
2620   PetscFunctionBegin;
2621   PetscCallCUDA(cudaFree(mmdata->Bt));
2622   delete mmdata->Bcsr;
2623 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2624   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2625   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2626   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2627   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2628   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2629   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2630   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2631   #endif
2632   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2633   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2634 #endif
2635   PetscCall(MatDestroy(&mmdata->X));
2636   PetscCall(PetscFree(data));
2637   PetscFunctionReturn(PETSC_SUCCESS);
2638 }
2639 
2640 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2641 
2642 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643 {
2644   Mat_Product                  *product = C->product;
2645   Mat                           A, B;
2646   PetscInt                      m, n, blda, clda;
2647   PetscBool                     flg, biscuda;
2648   Mat_SeqAIJCUSPARSE           *cusp;
2649   cusparseStatus_t              stat;
2650   cusparseOperation_t           opA;
2651   const PetscScalar            *barray;
2652   PetscScalar                  *carray;
2653   MatMatCusparse               *mmdata;
2654   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655   CsrMatrix                    *csrmat;
2656 
2657   PetscFunctionBegin;
2658   MatCheckProduct(C, 1);
2659   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660   mmdata = (MatMatCusparse *)product->data;
2661   A      = product->A;
2662   B      = product->B;
2663   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666      Instead of silently accepting the wrong answer, I prefer to raise the error */
2667   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670   switch (product->type) {
2671   case MATPRODUCT_AB:
2672   case MATPRODUCT_PtAP:
2673     mat = cusp->mat;
2674     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675     m   = A->rmap->n;
2676     n   = B->cmap->n;
2677     break;
2678   case MATPRODUCT_AtB:
2679     if (!A->form_explicit_transpose) {
2680       mat = cusp->mat;
2681       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682     } else {
2683       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684       mat = cusp->matTranspose;
2685       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686     }
2687     m = A->cmap->n;
2688     n = B->cmap->n;
2689     break;
2690   case MATPRODUCT_ABt:
2691   case MATPRODUCT_RARt:
2692     mat = cusp->mat;
2693     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694     m   = A->rmap->n;
2695     n   = B->rmap->n;
2696     break;
2697   default:
2698     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699   }
2700   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701   csrmat = (CsrMatrix *)mat->mat;
2702   /* if the user passed a CPU matrix, copy the data to the GPU */
2703   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2706 
2707   PetscCall(MatDenseGetLDA(B, &blda));
2708   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711   } else {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(C, &clda));
2714   }
2715 
2716   PetscCall(PetscLogGpuTimeBegin());
2717 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2720   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2721     size_t mmBufferSize;
2722     if (mmdata->initialized && mmdata->Blda != blda) {
2723       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2724       mmdata->matBDescr = NULL;
2725     }
2726     if (!mmdata->matBDescr) {
2727       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2728       mmdata->Blda = blda;
2729     }
2730 
2731     if (mmdata->initialized && mmdata->Clda != clda) {
2732       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2733       mmdata->matCDescr = NULL;
2734     }
2735     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2736       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737       mmdata->Clda = clda;
2738     }
2739 
2740     if (!mat->matDescr) {
2741       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2742                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2743       PetscCallCUSPARSE(stat);
2744     }
2745     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2746     PetscCallCUSPARSE(stat);
2747     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2748       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2749       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2750       mmdata->mmBufferSize = mmBufferSize;
2751     }
2752     mmdata->initialized = PETSC_TRUE;
2753   } else {
2754     /* to be safe, always update pointers of the mats */
2755     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2756     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2757     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2758   }
2759 
2760   /* do cusparseSpMM, which supports transpose on B */
2761   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2762   PetscCallCUSPARSE(stat);
2763 #else
2764   PetscInt k;
2765   /* cusparseXcsrmm does not support transpose on B */
2766   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2767     cublasHandle_t cublasv2handle;
2768     cublasStatus_t cerr;
2769 
2770     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2771     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2772     PetscCallCUBLAS(cerr);
2773     blda = B->cmap->n;
2774     k    = B->cmap->n;
2775   } else {
2776     k = B->rmap->n;
2777   }
2778 
2779   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2780   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2781   PetscCallCUSPARSE(stat);
2782 #endif
2783   PetscCall(PetscLogGpuTimeEnd());
2784   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2785   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2786   if (product->type == MATPRODUCT_RARt) {
2787     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2788     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2789   } else if (product->type == MATPRODUCT_PtAP) {
2790     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2791     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2792   } else {
2793     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2794   }
2795   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2796   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2797   PetscFunctionReturn(PETSC_SUCCESS);
2798 }
2799 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a matrix product whose
  operand A is a MATSEQAIJCUSPARSE matrix and whose result C is stored as a dense matrix.

  Input/Output Parameter:
. C - the product matrix; C->product supplies the operands A and B and the product type

  What it does:
  - sizes C according to the product type (AB, AtB, ABt, PtAP or RARt),
  - switches C to MATSEQDENSECUDA, remembering whether it was MATSEQDENSE before,
  - allocates the MatMatCusparse product data (plus a B^T buffer when built against CUDA < 11
    for ABt/RARt, and the intermediate matrix X for PtAP/RARt),
  - installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine of C.

  Errors:
  PETSC_ERR_GPU if product data is already attached to C, if A is not MATSEQAIJCUSPARSE, if A is
  not stored in MAT_CUSPARSE_CSR format, or if the product type is not one of the five listed above.
*/
2800 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2801 {
2802   Mat_Product        *product = C->product;
2803   Mat                 A, B;
2804   PetscInt            m, n;
2805   PetscBool           cisdense, flg;
2806   MatMatCusparse     *mmdata;
2807   Mat_SeqAIJCUSPARSE *cusp;
2808 
2809   PetscFunctionBegin;
2810   MatCheckProduct(C, 1);
       /* the symbolic phase must start from a product without data; existing data is an error */
2811   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2812   A = product->A;
2813   B = product->B;
2814   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2815   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
       /* A->spptr is interpreted as Mat_SeqAIJCUSPARSE only after the type check above */
2816   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2817   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
       /* choose the m x n size of the result C for each supported product type */
2818   switch (product->type) {
2819   case MATPRODUCT_AB:
2820     m = A->rmap->n;
2821     n = B->cmap->n;
2822     break;
2823   case MATPRODUCT_AtB:
2824     m = A->cmap->n;
2825     n = B->cmap->n;
2826     break;
2827   case MATPRODUCT_ABt:
2828     m = A->rmap->n;
2829     n = B->rmap->n;
2830     break;
2831   case MATPRODUCT_PtAP:
         /* result is square, of order equal to the number of columns of B */
2832     m = B->cmap->n;
2833     n = B->cmap->n;
2834     break;
2835   case MATPRODUCT_RARt:
         /* result is square, of order equal to the number of rows of B */
2836     m = B->rmap->n;
2837     n = B->rmap->n;
2838     break;
2839   default:
2840     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2841   }
       /* the same values are passed for the local and the global sizes */
2842   PetscCall(MatSetSizes(C, m, n, m, n));
2843   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2844   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2845   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2846 
2847   /* product data */
2848   PetscCall(PetscNew(&mmdata));
       /* when this flag is set, the numeric routine converts C back to MATSEQDENSE after computing */
2849   mmdata->cisdense = cisdense;
2850 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2851   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2852   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2853 #endif
2854   /* for these products we need intermediate storage */
       /* X receives the sparse-times-dense result in the numeric phase, which then performs a final
          dense-dense multiply into C; X has A->rmap->n rows and B->rmap->n (RARt) or B->cmap->n (PtAP) columns */
2855   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2856     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2857     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2858     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2859       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2860     } else {
2861       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2862     }
2863   }
       /* NOTE(review): mmdata is attached to C->product only here, so an error return from any call
          between PetscNew() and this point leaves mmdata (and Bt / X) unowned. The sparse-sparse
          symbolic routine in this file attaches its data right after PetscNew() - confirm whether
          the same ordering is wanted here. */
2864   C->product->data    = mmdata;
2865   C->product->destroy = MatDestroy_MatMatCusparse;
2866 
       /* register the numeric-phase implementation for this product */
2867   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2868   PetscFunctionReturn(PETSC_SUCCESS);
2869 }
2870 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a sparse-sparse product
  C = op(A) * op(B) where A, B and C are all MATSEQAIJCUSPARSE matrices in MAT_CUSPARSE_CSR format.

  Input/Output Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse data built by the symbolic phase

  Execution paths:
  - mmdata->reusesym is set: the values were already computed by the symbolic phase. The flag is
    cleared, the GPU structures of C are sanity-checked, and the routine jumps to the finalize step.
  - c->nz is zero: there is nothing to compute and the routine jumps to the finalize step.
  - otherwise: A and B are copied to the GPU, the multiplication structures are selected from the
    product type (AB, AtB, ABt) and the values of C are recomputed with cuSPARSE:
      CUDA >= 11.4          cusparseSpGEMMreuse_compute
      11.0 <= CUDA < 11.4   cusparseSpGEMM_compute followed by cusparseSpGEMM_copy
      CUDA < 11.0           cusparse_csr_spgemm on the raw CSR arrays

  opA and opB are always CUSPARSE_OPERATION_NON_TRANSPOSE; a transposed operand is taken from the
  matTranspose member of the corresponding Mat_SeqAIJCUSPARSE.

  The finalize step marks C as assembled without going through the regular assembly routine.

  Errors:
  PETSC_ERR_GPU for missing product data, wrong matrix types or formats, missing GPU structures or
  an unsupported product type; PETSC_ERR_ARG_WRONG if A or B is bound to the CPU; PETSC_ERR_PLIB if
  the symmetry shortcut is taken here but was not recorded by the symbolic phase.
*/
2871 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2872 {
2873   Mat_Product                  *product = C->product;
2874   Mat                           A, B;
2875   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2876   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2877   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2878   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2879   PetscBool                     flg;
2880   cusparseStatus_t              stat;
2881   MatProductType                ptype;
2882   MatMatCusparse               *mmdata;
2883 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2884   cusparseSpMatDescr_t BmatSpDescr;
2885 #endif
2886   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2887 
2888   PetscFunctionBegin;
2889   MatCheckProduct(C, 1);
2890   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
       /* the result itself must be a MATSEQAIJCUSPARSE matrix */
2891   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2892   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2893   mmdata = (MatMatCusparse *)C->product->data;
2894   A      = product->A;
2895   B      = product->B;
2896   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
         /* one-shot flag: subsequent numeric calls take the full path below */
2897     mmdata->reusesym = PETSC_FALSE;
         /* only sanity-check the GPU structures of C; A and B are not inspected on this path */
2898     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2899     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2900     Cmat = Ccusp->mat;
2901     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2902     Ccsr = (CsrMatrix *)Cmat->mat;
2903     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2904     goto finalize;
2905   }
       /* C has no nonzeros: skip the cuSPARSE calls entirely */
2906   if (!c->nz) goto finalize;
2907   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2908   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2909   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2910   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2911   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2912   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2913   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2914   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2915   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2916   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2917   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2918   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
       /* bring both operands to the GPU before multiplying */
2919   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2920   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2921 
       /* a symmetric operand lets A^t B (resp. A B^t) be computed as A B, but only if the symbolic
          phase recorded that it made the same choice */
2922   ptype = product->type;
2923   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2924     ptype = MATPRODUCT_AB;
2925     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2926   }
2927   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2928     ptype = MATPRODUCT_AB;
2929     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2930   }
       /* select the multiplication structures; a transposed operand comes from matTranspose.
          NOTE(review): the symbolic phase calls MatSeqAIJCUSPARSEFormExplicitTranspose() before reading
          matTranspose, this routine does not. Whether the transpose values are refreshed elsewhere
          when A or B change between numeric calls is not visible here - confirm. */
2931   switch (ptype) {
2932   case MATPRODUCT_AB:
2933     Amat = Acusp->mat;
2934     Bmat = Bcusp->mat;
2935     break;
2936   case MATPRODUCT_AtB:
2937     Amat = Acusp->matTranspose;
2938     Bmat = Bcusp->mat;
2939     break;
2940   case MATPRODUCT_ABt:
2941     Amat = Acusp->mat;
2942     Bmat = Bcusp->matTranspose;
2943     break;
2944   default:
2945     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2946   }
2947   Cmat = Ccusp->mat;
2948   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2949   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2950   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2951   Acsr = (CsrMatrix *)Amat->mat;
       /* when the symbolic phase stored its own CSR view of B in mmdata->Bcsr, that view is used instead */
2952   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2953   Ccsr = (CsrMatrix *)Cmat->mat;
2954   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2955   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2956   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
       /* recompute the values of C (alpha = one, beta = zero) with the descriptor prepared in the symbolic phase */
2957   PetscCall(PetscLogGpuTimeBegin());
2958 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2959   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
       /* Cmat->alpha_one and Cmat->beta_zero are handed to cuSPARSE in device pointer mode */
2960   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2961   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
       /* CUDA >= 11.4: reuse API, a single compute call */
2962   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2963   PetscCallCUSPARSE(stat);
2964   #else
       /* 11.0 <= CUDA < 11.4: compute with the buffer kept in mmdata, then copy the result into C */
2965   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2966   PetscCallCUSPARSE(stat);
2967   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2968   PetscCallCUSPARSE(stat);
2969   #endif
2970 #else
       /* CUDA < 11.0: cusparse_csr_spgemm works directly on the raw CSR arrays of A, B and C */
2971   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2972                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2973   PetscCallCUSPARSE(stat);
2974 #endif
       /* the flop count was precomputed by the symbolic phase and stored in mmdata->flops */
2975   PetscCall(PetscLogGpuFlops(mmdata->flops));
       /* presumably waits for the GPU work to complete before the timer is stopped - confirm */
2976   PetscCallCUDA(WaitForCUDA());
2977   PetscCall(PetscLogGpuTimeEnd());
       /* the new values of C were produced on the GPU */
2978   C->offloadmask = PETSC_OFFLOAD_GPU;
2979 finalize:
2980   /* shorter version of MatAssemblyEnd_SeqAIJ */
2981   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2982   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2983   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2984   c->reallocs = 0;
       /* no-op (adds zero), matching the zero mallocs reported in the info message above */
2985   C->info.mallocs += 0;
2986   C->info.nz_unneeded = 0;
       /* mark C as assembled and count this call as one more assembly */
2987   C->assembled = C->was_assembled = PETSC_TRUE;
2988   C->num_ass++;
2989   PetscFunctionReturn(PETSC_SUCCESS);
2990 }
2991 
2992 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2993 {
2994   Mat_Product                  *product = C->product;
2995   Mat                           A, B;
2996   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2997   Mat_SeqAIJ                   *a, *b, *c;
2998   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2999   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3000   PetscInt                      i, j, m, n, k;
3001   PetscBool                     flg;
3002   cusparseStatus_t              stat;
3003   MatProductType                ptype;
3004   MatMatCusparse               *mmdata;
3005   PetscLogDouble                flops;
3006   PetscBool                     biscompressed, ciscompressed;
3007 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3008   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3009   cusparseSpMatDescr_t BmatSpDescr;
3010 #else
3011   int cnz;
3012 #endif
3013   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3014 
3015   PetscFunctionBegin;
3016   MatCheckProduct(C, 1);
3017   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3018   A = product->A;
3019   B = product->B;
3020   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3021   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3022   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3023   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3024   a = (Mat_SeqAIJ *)A->data;
3025   b = (Mat_SeqAIJ *)B->data;
3026   /* product data */
3027   PetscCall(PetscNew(&mmdata));
3028   C->product->data    = mmdata;
3029   C->product->destroy = MatDestroy_MatMatCusparse;
3030 
3031   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3032   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3033   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3034   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3035   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3037 
3038   ptype = product->type;
3039   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3040     ptype                                          = MATPRODUCT_AB;
3041     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3042   }
3043   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3044     ptype                                          = MATPRODUCT_AB;
3045     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3046   }
3047   biscompressed = PETSC_FALSE;
3048   ciscompressed = PETSC_FALSE;
3049   switch (ptype) {
3050   case MATPRODUCT_AB:
3051     m    = A->rmap->n;
3052     n    = B->cmap->n;
3053     k    = A->cmap->n;
3054     Amat = Acusp->mat;
3055     Bmat = Bcusp->mat;
3056     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3057     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3058     break;
3059   case MATPRODUCT_AtB:
3060     m = A->cmap->n;
3061     n = B->cmap->n;
3062     k = A->rmap->n;
3063     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3064     Amat = Acusp->matTranspose;
3065     Bmat = Bcusp->mat;
3066     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3067     break;
3068   case MATPRODUCT_ABt:
3069     m = A->rmap->n;
3070     n = B->rmap->n;
3071     k = A->cmap->n;
3072     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3073     Amat = Acusp->mat;
3074     Bmat = Bcusp->matTranspose;
3075     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3076     break;
3077   default:
3078     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3079   }
3080 
3081   /* create cusparse matrix */
3082   PetscCall(MatSetSizes(C, m, n, m, n));
3083   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3084   c     = (Mat_SeqAIJ *)C->data;
3085   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3086   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3087   Ccsr  = new CsrMatrix;
3088 
3089   c->compressedrow.use = ciscompressed;
3090   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3091     c->compressedrow.nrows = a->compressedrow.nrows;
3092     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3093     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3094     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3095     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3096     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3097   } else {
3098     c->compressedrow.nrows  = 0;
3099     c->compressedrow.i      = NULL;
3100     c->compressedrow.rindex = NULL;
3101     Ccusp->workVector       = NULL;
3102     Cmat->cprowIndices      = NULL;
3103   }
3104   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3105   Ccusp->mat        = Cmat;
3106   Ccusp->mat->mat   = Ccsr;
3107   Ccsr->num_rows    = Ccusp->nrows;
3108   Ccsr->num_cols    = n;
3109   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3110   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3111   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3112   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3113   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
3114   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
3115   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3116   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3117   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3119   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3120     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3121     c->nz                = 0;
3122     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3123     Ccsr->values         = new THRUSTARRAY(c->nz);
3124     goto finalizesym;
3125   }
3126 
3127   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3128   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3129   Acsr = (CsrMatrix *)Amat->mat;
3130   if (!biscompressed) {
3131     Bcsr = (CsrMatrix *)Bmat->mat;
3132 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3133     BmatSpDescr = Bmat->matDescr;
3134 #endif
3135   } else { /* we need to use row offsets for the full matrix */
3136     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3137     Bcsr                 = new CsrMatrix;
3138     Bcsr->num_rows       = B->rmap->n;
3139     Bcsr->num_cols       = cBcsr->num_cols;
3140     Bcsr->num_entries    = cBcsr->num_entries;
3141     Bcsr->column_indices = cBcsr->column_indices;
3142     Bcsr->values         = cBcsr->values;
3143     if (!Bcusp->rowoffsets_gpu) {
3144       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3145       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3146       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3147     }
3148     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3149     mmdata->Bcsr      = Bcsr;
3150 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3151     if (Bcsr->num_rows && Bcsr->num_cols) {
3152       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3153       PetscCallCUSPARSE(stat);
3154     }
3155     BmatSpDescr = mmdata->matSpBDescr;
3156 #endif
3157   }
3158   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3159   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3160   /* precompute flops count */
3161   if (ptype == MATPRODUCT_AB) {
3162     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3163       const PetscInt st = a->i[i];
3164       const PetscInt en = a->i[i + 1];
3165       for (j = st; j < en; j++) {
3166         const PetscInt brow = a->j[j];
3167         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3168       }
3169     }
3170   } else if (ptype == MATPRODUCT_AtB) {
3171     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3172       const PetscInt anzi = a->i[i + 1] - a->i[i];
3173       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3174       flops += (2. * anzi) * bnzi;
3175     }
3176   } else { /* TODO */
3177     flops = 0.;
3178   }
3179 
3180   mmdata->flops = flops;
3181   PetscCall(PetscLogGpuTimeBegin());
3182 
3183 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3184   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3185   // cuda-12.2 requires non-null csrRowOffsets
3186   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3187   PetscCallCUSPARSE(stat);
3188   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3189   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3190   {
3191     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3192      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3193   */
3194     void *dBuffer1 = NULL;
3195     void *dBuffer2 = NULL;
3196     void *dBuffer3 = NULL;
3197     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3198     size_t bufferSize1 = 0;
3199     size_t bufferSize2 = 0;
3200     size_t bufferSize3 = 0;
3201     size_t bufferSize4 = 0;
3202     size_t bufferSize5 = 0;
3203 
3204     /* ask bufferSize1 bytes for external memory */
3205     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3206     PetscCallCUSPARSE(stat);
3207     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3208     /* inspect the matrices A and B to understand the memory requirement for the next step */
3209     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3210     PetscCallCUSPARSE(stat);
3211 
3212     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3213     PetscCallCUSPARSE(stat);
3214     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3215     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3216     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3217     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3218     PetscCallCUSPARSE(stat);
3219     PetscCallCUDA(cudaFree(dBuffer1));
3220     PetscCallCUDA(cudaFree(dBuffer2));
3221 
3222     /* get matrix C non-zero entries C_nnz1 */
3223     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3224     c->nz = (PetscInt)C_nnz1;
3225     /* allocate matrix C */
3226     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3227     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3228     Ccsr->values = new THRUSTARRAY(c->nz);
3229     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3230     /* update matC with the new pointers */
3231     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3232     PetscCallCUSPARSE(stat);
3233 
3234     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3235     PetscCallCUSPARSE(stat);
3236     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3237     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3238     PetscCallCUSPARSE(stat);
3239     PetscCallCUDA(cudaFree(dBuffer3));
3240     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3241     PetscCallCUSPARSE(stat);
3242     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3243   }
3244   #else
3245   size_t bufSize2;
3246   /* ask bufferSize bytes for external memory */
3247   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3248   PetscCallCUSPARSE(stat);
3249   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3250   /* inspect the matrices A and B to understand the memory requirement for the next step */
3251   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3252   PetscCallCUSPARSE(stat);
3253   /* ask bufferSize again bytes for external memory */
3254   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3255   PetscCallCUSPARSE(stat);
3256   /* The CUSPARSE documentation is not clear, nor the API
3257      We need both buffers to perform the operations properly!
3258      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3259      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3260      is stored in the descriptor! What a messy API... */
3261   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3262   /* compute the intermediate product of A * B */
3263   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3264   PetscCallCUSPARSE(stat);
3265   /* get matrix C non-zero entries C_nnz1 */
3266   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3267   c->nz = (PetscInt)C_nnz1;
3268   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3269                       mmdata->mmBufferSize / 1024));
3270   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3271   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3272   Ccsr->values = new THRUSTARRAY(c->nz);
3273   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3274   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3275   PetscCallCUSPARSE(stat);
3276   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3277   PetscCallCUSPARSE(stat);
3278   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3279 #else
3280   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3281   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3282                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3283   PetscCallCUSPARSE(stat);
3284   c->nz                = cnz;
3285   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3286   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3287   Ccsr->values = new THRUSTARRAY(c->nz);
3288   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3289 
3290   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3291   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3292      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3293      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3294   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3295                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3296   PetscCallCUSPARSE(stat);
3297 #endif
3298   PetscCall(PetscLogGpuFlops(mmdata->flops));
3299   PetscCall(PetscLogGpuTimeEnd());
3300 finalizesym:
3301   c->singlemalloc = PETSC_FALSE;
3302   c->free_a       = PETSC_TRUE;
3303   c->free_ij      = PETSC_TRUE;
3304   PetscCall(PetscMalloc1(m + 1, &c->i));
3305   PetscCall(PetscMalloc1(c->nz, &c->j));
3306   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3307     PetscInt      *d_i = c->i;
3308     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3309     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3310     ii = *Ccsr->row_offsets;
3311     jj = *Ccsr->column_indices;
3312     if (ciscompressed) d_i = c->compressedrow.i;
3313     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3314     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3315   } else {
3316     PetscInt *d_i = c->i;
3317     if (ciscompressed) d_i = c->compressedrow.i;
3318     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3319     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3320   }
3321   if (ciscompressed) { /* need to expand host row offsets */
3322     PetscInt r = 0;
3323     c->i[0]    = 0;
3324     for (k = 0; k < c->compressedrow.nrows; k++) {
3325       const PetscInt next = c->compressedrow.rindex[k];
3326       const PetscInt old  = c->compressedrow.i[k];
3327       for (; r < next; r++) c->i[r + 1] = old;
3328     }
3329     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3330   }
3331   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3332   PetscCall(PetscMalloc1(m, &c->ilen));
3333   PetscCall(PetscMalloc1(m, &c->imax));
3334   c->maxnz         = c->nz;
3335   c->nonzerorowcnt = 0;
3336   c->rmax          = 0;
3337   for (k = 0; k < m; k++) {
3338     const PetscInt nn = c->i[k + 1] - c->i[k];
3339     c->ilen[k] = c->imax[k] = nn;
3340     c->nonzerorowcnt += (PetscInt) !!nn;
3341     c->rmax = PetscMax(c->rmax, nn);
3342   }
3343   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3344   PetscCall(PetscMalloc1(c->nz, &c->a));
3345   Ccsr->num_entries = c->nz;
3346 
3347   C->nonzerostate++;
3348   PetscCall(PetscLayoutSetUp(C->rmap));
3349   PetscCall(PetscLayoutSetUp(C->cmap));
3350   Ccusp->nonzerostate = C->nonzerostate;
3351   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3352   C->preallocated     = PETSC_TRUE;
3353   C->assembled        = PETSC_FALSE;
3354   C->was_assembled    = PETSC_FALSE;
3355   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3356     mmdata->reusesym = PETSC_TRUE;
3357     C->offloadmask   = PETSC_OFFLOAD_GPU;
3358   }
3359   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3360   PetscFunctionReturn(PETSC_SUCCESS);
3361 }
3362 
3363 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3364 
/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic routine for the product stored in mat->product.

  Handles sparse or dense B. The cuSPARSE sparse-sparse path is considered only when neither A nor B is
  bound to the CPU and B is of type MATSEQAIJCUSPARSE (and, for MATPRODUCT_ABC, C is an unbound
  MATSEQAIJCUSPARSE as well). For most product types the user can still request the CPU backend through
  the options database. Anything that is neither dense nor eligible for the GPU sparse path falls back
  to MatProductSetFromOptions_SeqAIJ().
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    /* the triple product involves a third matrix that has to qualify too */
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu  = PETSC_FALSE;
    const char *apiname = NULL; /* user-level routine: options title when api_user is set, and manual page in both cases */
    const char *apiopt  = NULL; /* option consulted when the product was requested through that routine */
    const char *title   = NULL; /* options title used when the product was not requested through that routine */

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname = "MatMatMult";
      apiopt  = "-matmatmult_backend_cpu";
      title   = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname = "MatTransposeMatMult";
      apiopt  = "-mattransposematmult_backend_cpu";
      title   = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname = "MatPtAP";
      apiopt  = "-matptap_backend_cpu";
      title   = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname = "MatRARt";
      apiopt  = "-matrart_backend_cpu";
      title   = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname = "MatMatMatMult";
      apiopt  = "-matmatmatmult_backend_cpu";
      title   = "MatProduct_ABC";
      break;
    default: /* no backend option is consulted for the remaining types (e.g. MATPRODUCT_ABt) */
      break;
    }

    if (apiname) {
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, apiname, "Mat");
        PetscCall(PetscOptionsBool(apiopt, "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
    }
    /* clearing both flags routes the product to the AIJ fallback below */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      else PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3482 
/* yy = A xx: forwards to the shared multiply kernel with no vector to add and no transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3489 
/* zz = A xx + yy: forwards to the shared multiply kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3496 
/* yy = A^H xx: forwards to the shared multiply kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3503 
/* zz = A^H xx + yy: forwards to the shared multiply kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3510 
/* yy = A^T xx: forwards to the shared multiply kernel with the transpose flag only */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3517 
/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n.

  Launch layout: 1D grid of 1D blocks, one thread per entry; the grid may overshoot n because every
  thread is bounds-checked. No shared memory is used.

  The update is a plain read-modify-write (no atomics), so the entries of idx are presumably
  distinct -- NOTE(review): confirm against the callers.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* form the global index in PetscInt so that it cannot wrap in 32-bit arithmetic when PetscInt is 64-bit */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3523 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - common backend of the MatMult/MatMultAdd family: z = op(A) x + y.

  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

  Input Parameters:
+ A     - the matrix
. xx    - the vector to multiply
. yy    - the vector to add, or NULL when only op(A) x is wanted
. trans - apply the transpose of A
- herm  - together with trans, apply the conjugate transpose; herm without trans is rejected

  Output Parameter:
. zz - the result vector; it may be the same Vec as yy

  Notes:
  When A has no nonzeros the routine only copies yy into zz (or zeroes zz) and returns.

  For the plain transpose the explicitly stored transpose is used if A->form_explicit_transpose is set
  (it is built on demand); otherwise, and always for the Hermitian transpose, cuSPARSE is asked to
  apply the operation to the stored matrix.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx = 0, ny = 0; /* only meaningful (and only used) for MAT_CUSPARSE_CSR */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* nothing to multiply: the result is just yy, or zero */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* matstruct is dereferenced right below, so validate it whichever branch selected it */
  PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes the per-operation cuSpMV[] cache, so its numeric value must stay within the expected range */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
        const PetscInt n = matstruct->cprowIndices->size();

        PetscCall(PetscLogGpuTimeBegin());
        if (n > 0) { /* a launch with an empty grid is an invalid configuration */
          ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
          PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status, so pick up launch failures here */
        }
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3694 
/* zz = A^T xx + yy: forwards to the shared multiply kernel with the transpose flag only */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3701 
/* Assembly end for this matrix type: no GPU-specific work is done here, the call is passed on to the SeqAIJ routine */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  {
    PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3708 
/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix of type `MATSEQAIJCUSPARSE`, which uses the AIJ
  (compressed row) storage format.

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  /* constness is dropped for the internal preallocation call */
  PetscInt *rowlens = const_cast<PetscInt *>(nnz);

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* local and global sizes coincide */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, rowlens));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3756 
/*
  MatDestroy_SeqAIJCUSPARSE - destroys the GPU-side data attached to A, detaches the composed methods
  listed below, and finally hands the matrix to MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed methods detached from A, in this order */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };
  size_t i;

  PetscFunctionBegin;
  /* for a factored matrix A->spptr is treated as the triangular-factor structure instead */
  if (A->factortype != MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSE_Destroy(A));

  for (i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3778 
3779 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3780 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicates A through the SeqAIJ routine, then converts the new matrix in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* step 1: SeqAIJ duplicate */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* step 2: in-place conversion of the duplicate */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3788 
/*
  MatAXPY_SeqAIJCUSPARSE - adds a*X into Y, choosing between three code paths:

  - X and Y do not share the same axpy method, or the (possibly upgraded) pattern flag is neither
    SAME_NONZERO_PATTERN nor SUBSET_NONZERO_PATTERN: invalidate the cached transpose of Y and call MatAXPY_SeqAIJ()
  - SAME_NONZERO_PATTERN: a single cublasXaxpy() over the device value arrays
  - SUBSET_NONZERO_PATTERN: cusparse_csr_spgeam() with Y used both as an input and as the output
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x  = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *y  = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;

  /* see if we can turn this into a cublas axpy: compare the row offsets and, only if they agree, the column indices */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    const bool same = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()) && thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());

    if (same) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  /* anything that is neither a subset nor an identical pattern is handled by the SeqAIJ routine */
  if (str != SUBSET_NONZERO_PATTERN && str != SAME_NONZERO_PATTERN) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* both GPU paths need the device value arrays */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    /* a and b live on the host, so switch the pointer mode for the duration of the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
  } else {
    /* identical patterns: the value arrays line up entry by entry */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3874 
/* Scale every stored value of Y by a with a single cublasXscal() call on the device value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t handle;
  PetscScalar   *vals;
  PetscBLASInt   stride = 1, nvals = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  /* the number of stored entries has to fit in a BLAS integer */
  PetscCall(PetscBLASIntCast(aij->nz, &nvals));

  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, nvals, &a, vals, stride));
  PetscCall(PetscLogGpuFlops(nvals));
  PetscCall(PetscLogGpuTimeEnd());

  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3894 
/*
  Zero the stored values of A on the host and, when they exist, in the device copies
  (the matrix itself and its cached transpose). The offload mask ends up as
  PETSC_OFFLOAD_BOTH if the device values of the matrix were zeroed, PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij        = (Mat_SeqAIJ *)A->data;
  PetscBool   gpu_zeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
    CsrMatrix          *csr;

    if (cusp->mat) {
      csr = (CsrMatrix *)cusp->mat->mat;
      if (csr->values) {
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
        gpu_zeroed = PETSC_TRUE;
      }
    }
    /* the transpose, if present, is zeroed too but does not influence the offload mask */
    if (cusp->matTranspose) {
      csr = (CsrMatrix *)cusp->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
    }
  }
  /* host values: a->i[number of rows] is the count of stored entries */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpu_zeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3921 
/*
  MatBindToCPU_SeqAIJCUSPARSE - installs either the host (SeqAIJ) or the device (CUSPARSE) methods on A

  flg == PETSC_TRUE : copy the data back from the GPU, install the SeqAIJ methods, clear the
                      Mat_SeqAIJ op table and remove the composed CUSPARSE functions
  flg == PETSC_FALSE: install the CUSPARSE methods, array accessors and composed functions

  Factored matrices only have their boundtocpu flag updated.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  if (!flg) {
    /* device side: matrix-level operations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* device side: SeqAIJ array accessors */
    aij->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* host side: make sure the host copy is current before handing the matrix to the SeqAIJ methods */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop every SeqAIJ-level accessor override at once */
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  }

  A->boundtocpu = flg;
  /* inodes are only used when bound to the CPU and inode information is available */
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3985 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a SeqAIJ matrix to MATSEQAIJCUSPARSE

  MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the existing *newmat,
  any other reuse value operates directly on *newmat. Unless reusing, a missing spptr is created
  (with its own cuSPARSE handle) before the CUSPARSE methods are installed.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: the GPU side holds triangular-factor data */
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* methods that do not depend on the CPU/GPU binding */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the binding-dependent (GPU) methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4045 
/* Constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* host-side SeqAIJ setup first */
  PetscCall(MatCreate_SeqAIJ(B));
  /* then attach the CUSPARSE data and methods to the same object */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4053 
4054 /*MC
4055    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4056 
4057    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4058    CSR, ELL, or Hybrid format.
4059    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4060 
4061    Options Database Keys:
4062 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4063 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4064                                       Other options include ell (ellpack) or hyb (hybrid).
4065 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4066 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4067 
4068   Level: beginner
4069 
4070 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4071 M*/
4072 
/* Register the CUSPARSE solver for MATSEQAIJCUSPARSE with the LU, Cholesky, ILU and ICC factor types */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* every factor type is served by the same MatGetFactor routine; registration order matches the list */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};
  const int           ntypes   = (int)(sizeof(ftypes) / sizeof(ftypes[0]));

  PetscFunctionBegin;
  for (int k = 0; k < ntypes; k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4083 
/* Release everything hanging off mat->spptr for an unfactored MATSEQAIJCUSPARSE matrix; a NULL spptr is a no-op */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *spptr = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!spptr) PetscFunctionReturn(PETSC_SUCCESS);

  /* the matrix and its cached transpose */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->mat, spptr->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose, spptr->format));

  /* auxiliary objects allocated with new */
  delete spptr->workVector;
  delete spptr->rowoffsets_gpu;
  delete spptr->csr2csc_i;
  delete spptr->coords;

  if (spptr->handle) PetscCallCUSPARSE(cusparseDestroy(spptr->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4101 
/* Delete a CsrMatrix together with its three arrays and reset the caller's pointer; a NULL *mat is a no-op */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4114 
4115 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Overload for a single triangular factor: release its descriptor, solve info, CSR data and buffers, then free the struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(PETSC_SUCCESS);
  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  /* AA_h is released with cudaFreeHost(), the other buffers with cudaFree() */
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
  /* free through the caller's pointer, as the original does */
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4132 #endif
4133 
/*
  Overload for a multiplication struct: destroy the stored matrix (CSR, or HYB/ELL on CUDA < 11),
  its descriptors, the device scalars and the SpMV work data, then delete the struct and NULL the caller's pointer.
  A NULL *matstruct is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms = *matstruct;

  PetscFunctionBegin;
  if (!ms) PetscFunctionReturn(PETSC_SUCCESS);

  if (ms->mat) {
    const bool is_hyb = (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB);

    if (!is_hyb) {
      CsrMatrix *csr = (CsrMatrix *)ms->mat;

      PetscCall(CsrMatrix_Destroy(&csr));
    } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      PetscCallCUSPARSE(cusparseDestroyHybMat((cusparseHybMat_t)ms->mat));
#endif
    }
  }
  if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
  delete ms->cprowIndices;
  /* scalar constants released with cudaFree() */
  if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
  if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
  if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));
  /* three SpMV slots; only the initialized ones own a buffer and vector descriptors */
  for (int k = 0; k < 3; k++) {
    if (!ms->cuSpMV[k].initialized) continue;
    PetscCallCUDA(cudaFree(ms->cuSpMV[k].spmvBuffer));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[k].vecXDescr));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[k].vecYDescr));
  }
#endif
  delete ms;
  *matstruct = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4175 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the factorization data held in *trifactors but keeps the
  struct itself (and its cuSPARSE handle) alive so it can be reused.

  Every pointer and descriptor is set to NULL after it has been released. MatSeqAIJCUSPARSETriFactors_Destroy()
  calls this routine again, so without the reset a struct that had already been Reset would have the same
  device buffers freed, and the same descriptors destroyed, a second time. Descriptor destroy calls are
  skipped for NULL handles; cudaFree() on a NULL pointer performs no operation.

  A NULL *trifactors is a no-op.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device buffers */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;

    /* cuSPARSE descriptors: destroy only what exists, then forget the handle */
    if (fs->matDescr_M) PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    if (fs->spMatDescr_L) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    if (fs->spMatDescr_U) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    if (fs->spsvDescr_L) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    if (fs->spsvDescr_Lt) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    if (fs->spsvDescr_U) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    if (fs->spsvDescr_Ut) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    if (fs->dnVecDescr_X) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    if (fs->dnVecDescr_Y) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    if (fs->ilu0Info_M) PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    if (fs->ic0Info_M) PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    /* host mirrors released with PetscFree() */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4229 
/* Fully destroy a tri-factor container: reset its contents, destroy its cuSPARSE handle, free the struct. NULL is a no-op */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  /* Reset() leaves the handle alone, so it is destroyed separately afterwards */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4240 
/*
  Strict lexicographic "less than" on (first, second) index pairs: compare the first components and,
  only when they are equal, the second ones.

  The call operator is const so the (stateless) functor can be invoked through const references, and the
  components are read with the free function thrust::get<>() rather than the tuple member accessor.
*/
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
4249 
/*
  Mark the cached transpose of A as out of date. With destroy == PETSC_TRUE the transpose data and the
  csr2csc_i array are released as well. Does nothing when A has no spptr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (spptr) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose, spptr->format));
      delete spptr->csr2csc_i;
      spptr->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4265 
/* Container destructor for the device COO struct: perm and jmap are released with cudaFree(), the struct with PetscFree() */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *dev_coo = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(dev_coo->perm));
  PetscCallCUDA(cudaFree(dev_coo->jmap));
  PetscCall(PetscFree(dev_coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4275 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE

  The index analysis is done by MatSetPreallocationCOO_SeqAIJ() on host copies of coo_i/coo_j (device
  indices are copied to the host first). Afterwards the matrix is pushed to the GPU and a device copy of
  the host COO struct (jmap and perm arrays) is attached to the matrix as "__PETSc_MatCOOStruct_Device".
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         mtype = PETSC_MEMTYPE_HOST;
  PetscBool            on_device;
  PetscInt            *hi = coo_i, *hj = coo_j; /* host-accessible indices; replaced below if the input is on the device */
  PetscContainer       host_box, dev_box;
  MatCOOStruct_SeqAIJ *host_coo, *dev_coo;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  on_device = PetscMemTypeDevice(mtype) ? PETSC_TRUE : PETSC_FALSE;
  if (on_device) {
    /* stage the indices on the host for the SeqAIJ routine */
    PetscCall(PetscMalloc2(coo_n, &hi, coo_n, &hj));
    PetscCallCUDA(cudaMemcpy(hi, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(hj, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, hi, hj));
  if (on_device) PetscCall(PetscFree2(hi, hj));

  /* the host now holds the valid data; create the GPU memory from it */
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  /* fetch the host COO struct produced by the SeqAIJ routine */
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&host_box));
  PetscCall(PetscContainerGetPointer(host_box, (void **)&host_coo));

  /* shallow copy, then give the device struct its own jmap/perm arrays in GPU memory */
  PetscCall(PetscMalloc1(1, &dev_coo));
  *dev_coo = *host_coo;
  PetscCallCUDA(cudaMalloc((void **)&dev_coo->jmap, (host_coo->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(dev_coo->jmap, host_coo->jmap, (host_coo->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&dev_coo->perm, host_coo->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(dev_coo->perm, host_coo->perm, host_coo->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  /* wrap the device struct in a container and attach it to the matrix; our reference to the container is dropped afterwards */
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &dev_box));
  PetscCall(PetscContainerSetPointer(dev_box, dev_coo));
  PetscCall(PetscContainerSetUserDestroy(dev_box, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)dev_box));
  PetscCall(PetscContainerDestroy(&dev_box));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4321 
/*
  MatAddCOOValues - kernel that accumulates COO input values into the matrix value array

  For each i in [0, nnz): sums kv[perm[k]] for k in [jmap[i], jmap[i+1]) and stores the sum in a[i]
  (imode == INSERT_VALUES) or adds it to the existing a[i] (any other imode).

  Launch layout: 1D grid of 1D blocks. The loop strides by the total number of launched threads, so any
  grid size covers all nnz entries.

  The thread index and the stride are widened to PetscCount before multiplying: blockIdx.x, blockDim.x and
  gridDim.x are 32-bit unsigned, so their product would otherwise wrap once it exceeds 32 bits
  (this assumes PetscCount is wider than 32 bits; if it is not, the casts are harmless).
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4332 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets or adds the COO values v[] into A on the GPU

  Uses the device COO struct attached to A under "__PETSc_MatCOOStruct_Device". Host-resident v[] is
  copied into a temporary device buffer that is freed before returning.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *aij    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *cusp   = (Mat_SeqAIJCUSPARSE *)A->spptr;
  const PetscCount     nnz    = aij->nz;
  const PetscBool      insert = (imode == INSERT_VALUES) ? PETSC_TRUE : PETSC_FALSE;
  const PetscScalar   *v_dev  = v; /* device-accessible values; replaced below when v[] lives on the host */
  PetscBool            host_vals;
  PetscMemType         memtype;
  PetscScalar         *vals;
  PetscContainer       box;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!cusp->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&box));
  PetscCall(PetscContainerGetPointer(box, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  host_vals = PetscMemTypeHost(memtype) ? PETSC_TRUE : PETSC_FALSE;
  if (host_vals) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v_dev, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v_dev, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites every entry, so write-only access is enough */
  if (insert) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &vals));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &vals));

  PetscCall(PetscLogGpuTimeBegin());
  if (nnz) {
    /* 256 threads per block, enough blocks for one thread per stored entry */
    MatAddCOOValues<<<(nnz + 255) / 256, 256>>>(v_dev, nnz, coo->jmap, coo->perm, imode, vals);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (insert) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &vals));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &vals));

  if (host_vals) PetscCallCUDA(cudaFree((void *)v_dev));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4372 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, pass `NULL` if not needed
- j - the CSR column indices, pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  If both `i` and `j` are `NULL` the routine returns immediately without touching the matrix

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* return early only when nothing is requested; a single requested output is still served below */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the full row offsets are built once from the host a->i array and cached in cusp->rowoffsets_gpu */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4420 
4421 /*@C
4422   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4423 
4424   Not Collective
4425 
4426   Input Parameters:
4427 + A          - the matrix
4428 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4429 . i          - the CSR row pointers
4430 - j          - the CSR column indices
4431 
4432   Level: developer
4433 
4434 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4435 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed; /* not needed to restore; kept for symmetry with MatSeqAIJCUSPARSEGetIJ() */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* nothing to release: just clear whichever pointers the caller handed back */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4446 
4447 /*@C
4448   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4449 
4450   Not Collective
4451 
4452   Input Parameter:
4453 . A - a `MATSEQAIJCUSPARSE` matrix
4454 
4455   Output Parameter:
4456 . a - pointer to the device data
4457 
4458   Level: developer
4459 
4460   Note:
4461   May trigger host-device copies if up-to-date matrix data is on host
4462 
4463 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4464 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *device_csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a plain value array */
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL && spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");

  /* bring the device copy up to date before handing out a pointer into it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  device_csr = (CsrMatrix *)spptr->mat->mat;
  PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");

  *a = device_csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4482 
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only device array obtained with `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: the only thing to do is to clear the caller's pointer */
  a[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4505 
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

  On return the offload mask of `A` is set to `PETSC_OFFLOAD_GPU` and the cached transpose of `A` is invalidated

  The `MAT_CUSPARSE_ELL` and `MAT_CUSPARSE_HYB` storage formats are not supported and generate an error

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* touch A->spptr only after A has been validated, so a bad handle yields a PETSc error and not a crash */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* the caller may read the current values, so the device copy must be up to date */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller is expected to modify the values on the device: the GPU copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - gives back the read-write device array obtained with `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been changed through the pointer: drop cached diagonal data and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  a[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4567 
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger host-device copies and flags data validity on the GPU

  Since no copy to the GPU is performed here, the device data structure of `A` must already exist, otherwise an error is generated

  The `MAT_CUSPARSE_ELL` and `MAT_CUSPARSE_HYB` storage formats are not supported and generate an error

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* touch A->spptr only after A has been validated, so a bad handle yields a PETSc error and not a crash */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: the current values are irrelevant, hence no MatSeqAIJCUSPARSECopyToGPU() here */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller overwrites the values on the device: the GPU copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4604 
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only device array obtained with `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values have been rewritten through the pointer: drop cached diagonal data and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  a[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4629 
/*
  Strict weak ordering on (row, column, value, flag) tuples: lexicographic on (row, column) only;
  the value and the flag (elements 2 and 3) do not take part in the comparison.

  The elements are read with the free function thrust::get<N>() rather than with the member tuple.get<N>(),
  since the member form is not available when thrust::tuple is implemented on top of cuda::std::tuple.
*/
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 < r2) return true;
    if (r1 == r2) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
4638 
/* Unary functor adding a fixed integer offset to its argument; usable on host and device */
struct Shift {
  int _shift; /* offset added by operator() */

  Shift(int shift) : _shift(shift) { }
  /* const-qualified: applying the shift never modifies the functor, so it can be invoked through const objects */
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};
4645 
/*
  MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation

  Input Parameters:
+ A     - first `MATSEQAIJCUSPARSE` matrix
. B     - second `MATSEQAIJCUSPARSE` matrix, with the same number of rows as A
- reuse - `MAT_INITIAL_MATRIX` to create C, otherwise the values of an existing C are refreshed; `MAT_INPLACE_MATRIX` is not supported

  Output Parameter:
. C - the merged matrix, with A->rmap->n rows and A->cmap->n + B->cmap->n columns

  Notes:
  With `MAT_INITIAL_MATRIX` the nonzero pattern of C is built on the GPU and copied to the host; the host values array is only allocated.
  The positions inside the values of C of the entries of A (first) and of B (second) are stored in Ccusp->coords,
  and are what the reuse path relies on to refresh the values without recomputing the pattern.

  On return the offload mask of C is `PETSC_OFFLOAD_GPU`.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* access the implementation data only after A and B have been validated */
  a     = (Mat_SeqAIJ *)A->data;
  b     = (Mat_SeqAIJ *)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants stored with the mult structure */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      { /* scope of the merge temporaries: they are released when it is left, also on an early error return */
        THRUSTINTARRAY32  Acoo(Annz);  /* row index of every entry of A */
        THRUSTINTARRAY32  Bcoo(Bnnz);  /* row index of every entry of B */
        THRUSTINTARRAY32  Ccoo(c->nz); /* row index of every entry of C, filled by the merge */
        THRUSTINTARRAY32 *Aroff, *Broff;

        if (a->compressedrow.use) { /* need full row offset */
          if (!Acusp->rowoffsets_gpu) {
            Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
            Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
            PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
          }
          Aroff = Acusp->rowoffsets_gpu;
        } else Aroff = Acsr->row_offsets;
        if (b->compressedrow.use) { /* need full row offset */
          if (!Bcusp->rowoffsets_gpu) {
            Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
            Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
            PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
          }
          Broff = Bcusp->rowoffsets_gpu;
        } else Broff = Bcsr->row_offsets;
        PetscCall(PetscLogGpuTimeBegin());
        /* expand the CSR row offsets of A and B into one row index per entry */
        stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo.data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo.data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
        /* origin flag carried through the merge: 1 for the entries of A, 0 for the entries of B */
        auto Aperm = thrust::make_constant_iterator(1);
        auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
        /* the columns of B are placed after those of A: shift them by the number of columns of A on the fly */
        auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
        auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
        /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
        auto Bcib = Bcsr->column_indices->begin();
        auto Bcie = Bcsr->column_indices->end();
        thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
        THRUSTINTARRAY32 wPerm(Annz + Bnnz); /* origin flag of every entry of C */
        auto             Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
        auto             Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
        auto             Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(), Bcib, Bcsr->values->begin(), Bperm));
        auto             Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(), Bcie, Bcsr->values->end(), Bperm));
        auto             Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm.begin()));
        auto             p1  = Ccusp->coords->begin();
        auto             p2  = Ccusp->coords->begin();
        thrust::advance(p2, Annz);
        /* merge the (row, column, value, flag) tuples of A and B, ordered by (row, column), into C */
        PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
        /* undo the in-place shift of the column indices of B */
        thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
        auto cci = thrust::make_counting_iterator(zero);
        auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
        PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm.begin(),p1,p2,thrust::identity<int>()));
#else
        /* coords[0,Annz) receives the positions in C of the entries of A, coords[Annz,nz) those of the entries of B */
        auto pred = thrust::identity<int>();
        PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm.begin(), p1, pred));
        PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm.begin(), p2, pred));
#endif
        /* compress the row indices of C back into CSR row offsets */
        stat = cusparseXcoo2csr(Ccusp->handle, Ccoo.data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        PetscCall(PetscLogGpuTimeEnd());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the transpose of C is the transpose of A stacked on top of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one entry: the first (shifted) row offset of B^T overwrites the last row offset of A^T */
          thrust::advance(rT, -1);
        }
        if (BT) {
          /* the entries of B^T come after those of A^T: shift its row offsets by the number of nonzeros of A */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* host side: copy the nonzero pattern back from the GPU; the host values are allocated but not filled */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* C is supplied by the caller here: validate it before looking at its implementation data */
    PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
    PetscCheckTypeName(*C, MATSEQAIJCUSPARSE);
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0,nnz(A)) addresses the entries of A inside the values of C, the remainder addresses the entries of B */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* pair every value of A (then B) with its slot in C; VecCUDAEquals is defined elsewhere and presumably assigns the first element to the second */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* the values of the transpose of C are those of the transpose of A followed by those of the transpose of B */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4927 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies entries of the device values array of A into v

  Input Parameters:
+ A   - the matrix; its values are accessed with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - positions inside the values array of the entries to copy, or NULL to copy the first n entries

  Output Parameter:
. v - destination array; the result of isCudaMem(v) selects between the device and the host code path
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* bring the indices to the device so that the gather can run there */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                     w; /* device staging buffer, sized only when v is not device memory; freed on every exit path */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[k]] with dv[k]; VecCUDAEquals is defined elsewhere and presumably assigns the first element to the second */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travel from the device to the host when v is not device memory */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4963