xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 77df3101a0fd56376d67ff332bf6d7af1d692ea5)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17 // thrust::for_each(thrust::cuda::par.on()) requires C++14
18 #endif
19 #include <thrust/iterator/constant_iterator.h>
20 #include <thrust/remove.h>
21 #include <thrust/sort.h>
22 #include <thrust/unique.h>
23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24   #include <cuda/std/functional>
25 #endif
26 
/* Human-readable names for the MatCUSPARSEStorageFormat enum values, in 0-based value order.
   The trailing entries (enum type name, option prefix, null sentinel) follow the convention
   expected by PetscOptionsEnum(), which is used below to parse command-line options. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
  The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
  0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
37 
/* Factorization entry points implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Legacy csrsv-based triangular-solve path, compiled only for CUDA older than 11.4 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
/* Basic matrix operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destruction helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* COO assembly support */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71 
/* Implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE: record the requested
   GPU storage format on the matrix's GPU-specific data structure */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the same (single) format field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
89 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation composed with this matrix type (e.g. MatCUSPARSESetFormat_SeqAIJCUSPARSE);
     PetscTryMethod() is a no-op when the matrix type does not provide one */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
113 
/* Implementation of MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE: record the preference
   on the matrix's GPU-specific data structure */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
122 
123 /*@
124   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
125 
126   Input Parameters:
127 + A       - Matrix of type `MATSEQAIJCUSPARSE`
128 - use_cpu - set flag for using the built-in CPU `MatSolve()`
129 
130   Level: intermediate
131 
132   Note:
133   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
134   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
135   This method to specify if the solve is done on the CPU or GPU (GPU is the default).
136 
137 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation composed with this matrix type (e.g. MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);
     PetscTryMethod() is a no-op when the matrix type does not provide one */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
146 
/* Intercept the options this GPU type must handle itself; delegate everything else to SeqAIJ */
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* drop any cached transpose now, so that re-enabling the option later cannot see stale data */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
162 
// Process the -mat_cusparse_* command-line options (storage format, CPU solve, cuSPARSE algorithm choices)
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { // these options are only consulted for non-factored matrices
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    // same positional-consistency check as for SpMV above
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
198 
199 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Combine the host LU factors of A into a single regular CSR matrix M on the device and set up the
// cuSPARSE SpSV descriptors/buffers used by the triangular solves. In the factored SeqAIJ layout,
// L (without its unit diagonal) is addressed through Ai/Aj/Aa and U through Adiag, with the diagonal
// of U stored inverted (see the "recover the diagonal entry" line below).
// First call: builds the pattern, device arrays, descriptors and SpSV buffers; every call: refreshes
// the values and redoes (or, on CUDA >= 12.1.1, updates) the SpSV analysis.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // Only the pattern (Mi, Mj) is filled here; values are copied in the phase below, which runs on every call.
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];       // strictly-lower entries in row i
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // upper entries in row i, diagonal included
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      // This is what allows L and U to share the same CSR arrays (fs->csrRowPtr/csrColIdx/csrVal) below.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (stored inverted by the factorization)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
304 #else
// Build (first call) or refresh the values of (later calls) the unit-lower-triangular ILU factor L on
// the GPU, for the legacy csrsv triangular-solve path (this branch is compiled only for CUDA < 11.4).
// L is stored in its own CSR arrays with the implicit unit diagonal made explicit, since the host
// factored SeqAIJ matrix stores L strictly below the diagonal.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 holds only its unit diagonal) */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first call: build pattern + values and run the solve analysis */
        PetscScalar *AALo;

        /* pinned host buffers for fast host-to-device transfers */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 is the unit diagonal alone */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          /* append the explicit unit diagonal entry after the strictly-lower part of the row */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): catching char* matches historical CUSP-style throws here — confirm thrust's std::exception-derived errors are handled elsewhere */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
435 
// Build (first call) or refresh the values of (later calls) the upper-triangular ILU factor U on the
// GPU, for the legacy csrsv triangular-solve path (this branch is compiled only for CUDA < 11.4).
// The host factored SeqAIJ matrix addresses U through a->diag with rows laid out back-to-front, so
// the fill loops below walk rows from n-1 down to 0, placing each row's diagonal entry first.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first call: build pattern + values and run the solve analysis */
        PetscScalar *AAUp;

        /* pinned host buffers for fast host-to-device transfers */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* v[nz] = aa[adiag[i]]; recover the true diagonal (stored inverted by the factorization) */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only; the pattern from the first call is reused */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): catching char* matches historical CUSP-style throws here — confirm thrust's std::exception-derived errors are handled elsewhere */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
566 #endif
567 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fact = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS = aij->row, icolIS = aij->icol;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fact, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fact->workVector) fact->workVector = new THRUSTARRAY(n);
#endif

  fact->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* cache the row permutation indices on the GPU (used by the lower triangular solve);
     only allocated once, and only when the ordering is not the identity */
  PetscCall(ISIdentity(rowIS, &rowIdentity));
  if (!rowIdentity && !fact->rpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(rowIS, &indices));
    fact->rpermIndices = new THRUSTINTARRAY(n);
    fact->rpermIndices->assign(indices, indices + n);
    PetscCall(ISRestoreIndices(rowIS, &indices));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* cache the (inverse) column permutation indices on the GPU (used by the upper triangular solve) */
  PetscCall(ISIdentity(icolIS, &colIdentity));
  if (!colIdentity && !fact->cpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(icolIS, &indices));
    fact->cpermIndices = new THRUSTINTARRAY(n);
    fact->cpermIndices->assign(indices, indices + n);
    PetscCall(ISRestoreIndices(icolIS, &indices));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
614 
615 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (first call) or refresh (subsequent numeric factorizations) the GPU copy of the
// host-computed ICC/Cholesky factor of A, for use by MatSolve_SeqAIJCUSPARSE_Cholesky():
//  - fs->spMatDescr_U: U in CSR form (upper triangular, unit diagonal), plus SpSV descriptors
//    and buffers for both the U solve and the transposed (Ut) solve;
//  - fs->diag: the diagonal entries (already inverted on the host, per the note below);
//  - fs->X, fs->Y: device work vectors for the two-stage triangular solves.
// No-op unless A's latest factors are on the CPU (offloadmask == PETSC_OFFLOAD_CPU).
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz; // column indices (temporary) and nnz of the regular CSR copy M of U
  PetscScalar                  *Ma, *D;  // values of M, and the diagonal

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is non-null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; the values (fs->csrVal) are filled in the common path below
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse: the host staging buffers are kept so re-factorizations only refill values
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // negate the off-diagonal entries of the host factor
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    // With CUDA >= 12.1.1 a previous analysis can be reused and only the values updated
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
709 
// Solve Ut D U x = b, using the descriptors built by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky().
// U has unit diagonal and D is stored already inverted, so the middle stage is a plain
// element-wise multiply. Steps:
//   X = (optionally row-permuted) b  ->  solve Ut Y = X  ->  Y = D .* Y  ->  solve U X = Y
//   ->  x = (optionally column-permuted) X
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data); // only used for the flop count below
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: point the X descriptor directly at b's device array to avoid a copy
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  #if CCCL_VERSION >= 3001000
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>()));
  #else
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
  #endif

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: write the solve result straight into x's device array
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
769 #else
/* Legacy (CUDA < 11.4) path: build the two GPU triangular-solve objects for an ICC factor.
   Both share one CSR sparsity pattern (AiUp/AjUp, the upper triangle, diagonal stored first in
   each row):
   - upTriFactor holds U (diagonal stored as 1/d, negated off-diagonals) and is solved
     non-transposed;
   - loTriFactor holds the same rows additionally scaled by 1/d and is solved with
     CUSPARSE_OPERATION_TRANSPOSE so it acts as the lower-triangular solve.
   On re-factorization (factors already exist) only the numerical values are refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* NOTE(review): accesses the same data through the sbaij struct layout for i/j/a — presumably the leading members coincide; confirm */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two factors' values */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) { /* first time: build structure and values */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] is the last-stored entry of row i, used as the diagonal */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];        /* negate the off-diagonals for U */
              AALo[j] = AAUp[j] / v[nz]; /* additionally scale by the diagonal for the transposed (lower) solve */
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* stored upper triangular on purpose: the transpose solve below makes it act lower triangular */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix; same sparsity pattern (AiUp/AjUp) as the upper factor, different values */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else { /* structure already on the GPU: only refresh the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
966 #endif
967 
/* Push the (host-computed) ICC/Cholesky factor of A to the GPU and set up the solve objects.
   Also caches the row permutation and its inverse on the GPU when the ordering is nontrivial. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n; // count both triangles of the symmetric factor; the diagonal only once

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the permutation and its inverse on the GPU. Guard on rpermIndices (both arrays are
     created together below) so repeated numeric factorizations do not re-allocate and leak the
     previous arrays; this matches MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1009 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* the numeric factorization itself runs on the host, so bring A's data down first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; // fresh factors live on the CPU until copied up below

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* one routine serves both solve directions with the SpSV-based implementation */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* pick the MatSolve flavor according to whether the ordering is the natural one */
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    PetscBool   natural;

    PetscCall(ISIdentity(bseq->row, &natural));
    if (natural) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors / build the GPU solve objects */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1042 
1043 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1044 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1045 {
1046   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1047   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1048   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1049   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1051   cusparseIndexBase_t                indexBase;
1052   cusparseMatrixType_t               matrixType;
1053   cusparseFillMode_t                 fillMode;
1054   cusparseDiagType_t                 diagType;
1055 
1056   PetscFunctionBegin;
1057   /* allocate space for the transpose of the lower triangular factor */
1058   PetscCall(PetscNew(&loTriFactorT));
1059   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1060 
1061   /* set the matrix descriptors of the lower triangular factor */
1062   matrixType = cusparseGetMatType(loTriFactor->descr);
1063   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1064   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1065   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1066 
1067   /* Create the matrix description */
1068   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1069   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1070   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1071   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1072   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1073 
1074   /* set the operation */
1075   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1076 
1077   /* allocate GPU space for the CSC of the lower triangular factor*/
1078   loTriFactorT->csrMat                 = new CsrMatrix;
1079   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1080   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1081   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1082   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1083   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1084   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1085 
1086   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1087   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1088   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1089                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1090                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1091   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1092   #endif
1093 
1094   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1095   {
1096     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1097     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1098                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1099   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1100                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1101   #else
1102                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1103   #endif
1104     PetscCallCUSPARSE(stat);
1105   }
1106 
1107   PetscCallCUDA(WaitForCUDA());
1108   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1109 
1110   /* Create the solve analysis information */
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1112   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1113   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1114   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1115                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1116   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1117   #endif
1118 
1119   /* perform the solve analysis */
1120   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1121                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1122 
1123   PetscCallCUDA(WaitForCUDA());
1124   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1125 
1126   /* assign the pointer */
1127   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1128 
1129   /*********************************************/
1130   /* Now the Transpose of the Upper Tri Factor */
1131   /*********************************************/
1132 
1133   /* allocate space for the transpose of the upper triangular factor */
1134   PetscCall(PetscNew(&upTriFactorT));
1135   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1136 
1137   /* set the matrix descriptors of the upper triangular factor */
1138   matrixType = cusparseGetMatType(upTriFactor->descr);
1139   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1140   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1141   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1142 
1143   /* Create the matrix description */
1144   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1145   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1146   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1147   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1148   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1149 
1150   /* set the operation */
1151   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1152 
1153   /* allocate GPU space for the CSC of the upper triangular factor*/
1154   upTriFactorT->csrMat                 = new CsrMatrix;
1155   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1156   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1157   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1158   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1159   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1160   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1161 
1162   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1163   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1164   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1165                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1166                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1167   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1168   #endif
1169 
1170   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1171   {
1172     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1173     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1174                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1175   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1176                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1177   #else
1178                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1179   #endif
1180     PetscCallCUSPARSE(stat);
1181   }
1182 
1183   PetscCallCUDA(WaitForCUDA());
1184   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1185 
1186   /* Create the solve analysis information */
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1188   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1189   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1190   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1191                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1192   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1193   #endif
1194 
1195   /* perform the solve analysis */
1196   /* christ, would it have killed you to put this stuff in a function????????? */
1197   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1198                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1199 
1200   PetscCallCUDA(WaitForCUDA());
1201   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1202 
1203   /* assign the pointer */
1204   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1205   PetscFunctionReturn(PETSC_SUCCESS);
1206 }
1207 #endif
1208 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform to recover integer permutation indices that were stored
   in a scalar-valued array (the csr2csc trick in MatSeqAIJCUSPARSEFormExplicitTranspose). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar v) { return static_cast<PetscInt>(PetscRealPart(v)); }
};
1212 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) an explicit transpose of a
  SeqAIJCUSPARSE matrix on the GPU and caches it in cusparsestruct->matTranspose.

  Collective on A? No - sequential matrix, no MPI calls here.

  Notes:
  Returns immediately when A->transupdated is set (cached transpose is current). On first use
  with the CSR format it also builds csr2csc_i, a CSR->CSC value permutation, so that later
  refreshes only need a thrust gather of A's values instead of another cusparse csr2csc pass.
  ELL/HYB formats are handled only for CUDA < 11.0; since CUDA 11 they are rejected.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose is still valid */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate CSR arrays for the transpose; note the swapped row/col dimensions */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* Legacy (CUDA < 11) path: HYB -> CSR -> CSC -> HYB round trip through two host-side temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value permutation once: transpose a matrix whose values are
         0,1,2,..., so the transposed values ARE the permutation indices (then cast to int below) */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* refresh the transpose values by gathering A's values through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1405 
1406 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Solve A x = b using the cached cuSPARSE SpSV LU factorization (CUDA >= 11.4 path).
// The factors L and U live in fs (A->spptr). Optional row/column permutations from the
// factorization ordering are applied with thrust permutation iterators around the two
// triangular solves; fs->X and fs->Y serve as intermediate device work vectors.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     opA     = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *bv;
  PetscScalar                  *xv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xv));
  PetscCall(VecCUDAGetArrayRead(b, &bv));
  thrust::device_ptr<PetscScalar>       xptr = thrust::device_pointer_cast(xv);
  thrust::device_ptr<const PetscScalar> bptr = thrust::device_pointer_cast(bv);

  // With a row permutation, gather b into the work vector fs->X; otherwise the first
  // triangular solve can read b's device array directly
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bv));
  }

  // Forward solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_L));

  // Backward solve U X = Y; target fs->X when a column permutation must still be applied,
  // otherwise write straight into x's device array
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xv));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_U));

  // Scatter the solution through the column permutation back into x, if one exists
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), xptr));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bv));
  PetscCall(VecCUDARestoreArrayWrite(x, &xv));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1458 
// Solve A^T x = b when A = L*U has been factored with the cuSPARSE SpSV interface (CUDA >= 11.4).
// Since A^T = U^T L^T, this solves U^T y = b' then L^T x' = y, reusing the non-transposed
// factor matrices spMatDescr_L/U with opA = CUSPARSE_OPERATION_TRANSPOSE. The SpSV descriptors,
// work buffers, and analysis data for the transposed solves are created lazily on the first call
// and cached in fs (guarded by createdTransposeSpSVDescr / updatedTransposeSpSVAnalysis).
// Optional row/column permutations are applied before/after the solves via fs->X.
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // (Re)run the analysis phase for the transposed solves when the factors changed since the last call
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1529 #else
1530 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solve A^T x = b with the legacy csrsv interface (CUDA < 11.4) by solving against explicitly
  formed transposes of the triangular factors; these are built on demand by
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve and cached in A->spptr. The row permutation from the
  factorization ordering is applied before the solves and the column permutation after, using
  the factor work vector as scratch (the final permutation cannot be done in place).

  Fix: the three thrust::copy calls were previously unchecked; they are now wrapped in
  PetscCallThrust() for consistency with the rest of this file, so a thrust exception is
  converted into a PETSc error instead of propagating through this C-style call chain.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, gathering into x's array as scratch */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U (i.e. U^T in the transposed system A^T = U^T L^T) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1582 
/*
  Solve A^T x = b on the device for a naturally ordered (no row/column permutation)
  LU factorization, using explicitly transposed copies of the triangular factors.
  The transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Build the transposed factors on the fly if they do not exist yet */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Raw device pointers for the right-hand side and the solution */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Solve with the transposed upper factor first: work <- solve(upT, b) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Then the transposed lower factor: x <- solve(loT, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x, loT->solvePolicy, loT->solveBuffer));

  /* Hand the arrays back to the vectors */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1620 
/*
  Solve A x = b on the device for an LU factorization computed with a fill-reducing
  ordering: gather b through the row permutation, do the two triangular solves, then
  scatter the result through the column permutation.  The work vector and the x array
  are used alternately as source/destination of the two solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]].
     The copy length is determined by the rpermIndices index-iterator range; the base
     pointer bGPU is the same in both permutation iterators. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: xarray <- solve(L, tempGPU) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: tempGPU <- solve(U, xarray) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1660 
/*
  Solve A x = b on the device for a naturally ordered LU factorization (no
  permutations needed): a forward solve with L followed by a backward solve with U,
  staged through the factors' work vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Raw device pointers for the right-hand side and the solution */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Lower-triangular solve: work <- solve(L, b) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Upper-triangular solve: x <- solve(U, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1690 #endif
1691 
1692 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Numeric ILU(0) factorization on the GPU.

  Copies A's (already-uploaded) CSR values into fact's device value array and runs
  cusparseXcsrilu02() in place.  The pattern, descriptors, analysis info and buffers
  were prepared by MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(), which installed this
  routine as fact->ops->lufactornumeric.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  /* fact shares A's sparsity pattern, so a flat device-to-device value copy suffices */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* CUDA >= 12.1.1: refresh the values held by the existing SpSV analysis instead of redoing it */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    /* NOTE(review): unlike the branch below, this path leaves updatedTransposeSpSVAnalysis untouched;
       presumably the transpose-solve path refreshes its own analysis — confirm */
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  /* Install the SpSV-based solvers; matsolve variants are not provided by this path */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1755 
/*
  Symbolic ILU(0) setup on the GPU.

  Since ILU(0) produces factors with exactly A's sparsity pattern, this routine copies
  A's device row pointers / column indices into fact, creates the cuSPARSE descriptors
  for M (the in-place factor), L and U, sizes and allocates all factorization/solve
  buffers, runs the csrilu02 analysis, estimates the numeric-factorization FLOPs, and
  installs MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 as the numeric stage.
  The ordering ISs are ignored (natural ordering only), hence the unnamed parameters.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) adds no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are views of the same in-place factor storage: L is unit lower, U is non-unit upper */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  /* Dense work vectors used as SpSV input/output during analysis and solve */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *ai, *adiag, nzRow, nzLeft; /* renamed from Ai/Adiag to avoid shadowing the device pointers above */
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    ai    = Aseq->i;
    adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (ai[i] < adiag[i] && adiag[i] < ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow = ai[i + 1] - ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Use half the row as the estimated count of eliminated entries.  (The exact count adiag[i]-ai[i]
          was previously computed into nzLeft and immediately overwritten; that dead store is removed.)
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1912 
/*
  Triangular solves for the ICC(0) factorization A ~ L L^T: a forward solve with L
  followed by a backward solve with L^T, both through cusparseSpSV_solve() on the
  single stored lower factor.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b, with y kept in the factors' work array Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdev));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* L Y = X */
                                       factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Backward solve L^T x = y, writing straight into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdev));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* Lt X = Y */
                                       factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1943 
/*
  Numeric ICC(0) (incomplete Cholesky) factorization on the GPU.

  Copies A's (already-uploaded) CSR values into fact's device value array and runs
  cusparseXcsric02() in place; only the lower triangular part is read and factored.
  Setup (pattern copy, descriptors, buffers, analysis) was done by the symbolic stage.

  Consistency fix: bracket the device work with PetscLogGpuTimeBegin()/End(), matching
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(), so the logged GPU flops have a timed region.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* CUDA >= 12.1.1: refresh the values held by the existing SpSV analysis instead of redoing it */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric (it needs valid values), so it must follow cusparseXcsric02() */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  /* ICC solves are symmetric: the same routine serves solve and solvetranspose */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2005 
2006 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2007 {
2008   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2009   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2010   PetscInt                      m, nz;
2011 
2012   PetscFunctionBegin;
2013   if (PetscDefined(USE_DEBUG)) {
2014     PetscInt  i;
2015     PetscBool flg, missing;
2016 
2017     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2018     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2019     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2020     PetscCall(MatMissingDiagonal(A, &missing, &i));
2021     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2022   }
2023 
2024   /* Free the old stale stuff */
2025   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2026 
2027   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2028      but they will not be used. Allocate them just for easy debugging.
2029    */
2030   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2031 
2032   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2033   fact->factortype             = MAT_FACTOR_ICC;
2034   fact->info.factor_mallocs    = 0;
2035   fact->info.fill_ratio_given  = info->fill;
2036   fact->info.fill_ratio_needed = 1.0;
2037 
2038   aij->row = NULL;
2039   aij->col = NULL;
2040 
2041   /* ====================================================================== */
2042   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2043   /* We'll do in-place factorization on fact                                */
2044   /* ====================================================================== */
2045   const int *Ai, *Aj;
2046 
2047   m  = fact->rmap->n;
2048   nz = aij->nz;
2049 
2050   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2051   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2052   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2053   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2054   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2055   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2056 
2057   /* ====================================================================== */
2058   /* Create mat descriptors for M, L                                        */
2059   /* ====================================================================== */
2060   cusparseFillMode_t fillMode;
2061   cusparseDiagType_t diagType;
2062 
2063   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2064   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2065   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2066 
2067   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2068     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2069     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2070     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2071     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2072   */
2073   fillMode = CUSPARSE_FILL_MODE_LOWER;
2074   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2075   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2076   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2077   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2078 
2079   /* ========================================================================= */
2080   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2081   /* ========================================================================= */
2082   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2083   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2084 
2085   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2086   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2087 
2088   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2089   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2090 
2091   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2092   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2093 
2094   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2095   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2096 
2097   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2098      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2099    */
2100   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2101     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2102     fs->spsvBuffer_L = fs->factBuffer_M;
2103     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2104   } else {
2105     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2106     fs->spsvBuffer_Lt = fs->factBuffer_M;
2107     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2108   }
2109 
2110   /* ========================================================================== */
2111   /* Perform analysis of ic0 on M                                               */
2112   /* The lower triangular part of M has the same sparsity pattern as L          */
2113   /* ========================================================================== */
2114   int              structural_zero;
2115   cusparseStatus_t status;
2116 
2117   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2118   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2119   if (PetscDefined(USE_DEBUG)) {
2120     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2121     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2122     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2123   }
2124 
2125   /* Estimate FLOPs of the numeric factorization */
2126   {
2127     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2128     PetscInt      *Ai, nzRow, nzLeft;
2129     PetscLogDouble flops = 0.0;
2130 
2131     Ai = Aseq->i;
2132     for (PetscInt i = 0; i < m; i++) {
2133       nzRow = Ai[i + 1] - Ai[i];
2134       if (nzRow > 1) {
2135         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2136           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2137         */
2138         nzLeft = (nzRow - 1) / 2;
2139         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2140       }
2141     }
2142     fs->numericFactFlops = flops;
2143   }
2144   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 #endif
2148 
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* Numeric LU runs on the host: pull A's values down from the device first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(b->row, &row_identity));
    PetscCall(ISIdentity(b->col, &col_identity));
    if (row_identity && col_identity) { /* natural ordering on both sides */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2187 
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Drop any stale device-side factor data, then defer to the host symbolic LU */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2198 
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* ILU(0) with natural row/column ordering can be done with the dedicated device path */
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* General case: host symbolic ILU(k); the numeric phase decides where the solve runs */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2221 
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* IC(0) with natural ordering can be done with the dedicated device path */
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* General case: host symbolic ICC(k) */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2241 
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Discard stale device factor data, then run the host symbolic Cholesky */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2252 
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  /* Report the solver package that produced/backs this factored matrix */
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2259 
2260 /*MC
2261   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2262   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2264   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2265   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2266   algorithms are not recommended. This class does NOT support direct solver operations.
2267 
2268   Level: beginner
2269 
2270 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2271           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2272 M*/
2273 
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt rows = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, rows, rows, rows, rows));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (A->boundtocpu) {
      /* matrix pinned to the CPU: fall back to the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2314 
/* Copy the matrix values from the device back into the host array a->a.
   Only does work when the up-to-date copy lives exclusively on the GPU;
   afterwards both copies are marked valid (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* for a factored matrix, spptr instead holds a Mat_SeqAIJCUSPARSETriFactors */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* unfactored matrix: values live in the CSR storage of the mult struct */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2344 
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Bring the host copy of the values up to date before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2352 
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* The caller may have modified the host values: mark the device copy stale */
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2360 
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Read-only access still requires a current host copy of the values */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2368 
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  /* Read-only access: the offload mask is deliberately left untouched */
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2375 
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Write-only access: no device-to-host sync of the old values is needed */
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2382 
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* Host values were (re)written: the device copy is now stale */
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2390 
/* Expose the device-resident CSR arrays (row offsets i, column indices j, values a)
   of an unfactored matrix, along with the memory type they reside in (CUDA).

   Any of i, j, a, mtype may be NULL when the caller does not need that output.
   Errors for factored matrices, and for 64-bit index builds when i or j is requested,
   since the device arrays use 32-bit cuSPARSE indices (THRUSTINTARRAY32). */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* fixed grammar of the original error message ("does not supported") */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2421 
/* Ensure the matrix is present on the GPU in the requested cusparse storage format.
   If the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
   only the values are re-uploaded; otherwise all device-side structures are rebuilt. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* cleared when there are no host values, so only the device copy is marked valid */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz); /* upload host values into the existing device CSR storage */
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose values are stale (pattern kept: PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* Pattern changed or non-CSR format requested: destroy and rebuild device structures */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented; ridx maps back to full row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: take nnz from the row pointer and record that only the pattern was copied */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1), used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz); /* values upload is skipped when the host has none */

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: build a temporary CSR, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m; /* tmp counts the row-index entries uploaded, for the transfer log below */
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2571 
/* Thrust functor: accumulate tuple element 0 into tuple element 1 (y += x) */
struct VecCUDAPlusEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
2579 
/* Thrust functor: copy tuple element 0 into tuple element 1 (y = x) */
struct VecCUDAEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
2587 
/* Thrust functor: copy tuple element 1 into tuple element 0 (x = y) */
struct VecCUDAEqualsReverse {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2595 
/* Per-product workspace attached to a Mat_Product and released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably whether the product C is dense — confirm at use sites */
  PetscScalar   *Bt;       /* device buffer (freed with cudaFree) — presumably an explicit transpose of B */
  Mat            X;        /* auxiliary matrix, destroyed with MatDestroy() */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* device scratch buffers (cudaFree'd on destroy) */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2620 
/* Release all device buffers and cusparse descriptors held by a MatMatCusparse workspace */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = static_cast<MatMatCusparse *>(data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors and buffers may be NULL when never allocated for this product type */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2644 
2645 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2646 
2647 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2648 {
2649   Mat_Product                  *product = C->product;
2650   Mat                           A, B;
2651   PetscInt                      m, n, blda, clda;
2652   PetscBool                     flg, biscuda;
2653   Mat_SeqAIJCUSPARSE           *cusp;
2654   cusparseStatus_t              stat;
2655   cusparseOperation_t           opA;
2656   const PetscScalar            *barray;
2657   PetscScalar                  *carray;
2658   MatMatCusparse               *mmdata;
2659   Mat_SeqAIJCUSPARSEMultStruct *mat;
2660   CsrMatrix                    *csrmat;
2661 
2662   PetscFunctionBegin;
2663   MatCheckProduct(C, 1);
2664   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2665   mmdata = (MatMatCusparse *)product->data;
2666   A      = product->A;
2667   B      = product->B;
2668   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2669   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2670   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2671      Instead of silently accepting the wrong answer, I prefer to raise the error */
2672   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2673   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2674   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2675   switch (product->type) {
2676   case MATPRODUCT_AB:
2677   case MATPRODUCT_PtAP:
2678     mat = cusp->mat;
2679     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2680     m   = A->rmap->n;
2681     n   = B->cmap->n;
2682     break;
2683   case MATPRODUCT_AtB:
2684     if (!A->form_explicit_transpose) {
2685       mat = cusp->mat;
2686       opA = CUSPARSE_OPERATION_TRANSPOSE;
2687     } else {
2688       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2689       mat = cusp->matTranspose;
2690       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2691     }
2692     m = A->cmap->n;
2693     n = B->cmap->n;
2694     break;
2695   case MATPRODUCT_ABt:
2696   case MATPRODUCT_RARt:
2697     mat = cusp->mat;
2698     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2699     m   = A->rmap->n;
2700     n   = B->rmap->n;
2701     break;
2702   default:
2703     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2704   }
2705   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2706   csrmat = (CsrMatrix *)mat->mat;
2707   /* if the user passed a CPU matrix, copy the data to the GPU */
2708   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2709   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2710   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2711 
2712   PetscCall(MatDenseGetLDA(B, &blda));
2713   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2714     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2715     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2716   } else {
2717     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2718     PetscCall(MatDenseGetLDA(C, &clda));
2719   }
2720 
2721   PetscCall(PetscLogGpuTimeBegin());
2722 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2723   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2724   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2725   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2726   #else
2727   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2728   #endif
2729 
2730   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2731   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2732     size_t mmBufferSize;
2733     if (mmdata->initialized && mmdata->Blda != blda) {
2734       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2735       mmdata->matBDescr = NULL;
2736     }
2737     if (!mmdata->matBDescr) {
2738       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2739       mmdata->Blda = blda;
2740     }
2741 
2742     if (mmdata->initialized && mmdata->Clda != clda) {
2743       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2744       mmdata->matCDescr = NULL;
2745     }
2746     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2747       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2748       mmdata->Clda = clda;
2749     }
2750 
2751   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2752     if (matADescr) {
2753       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2754       matADescr = NULL;
2755     }
2756   #endif
2757 
2758     if (!matADescr) {
2759       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2760                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2761       PetscCallCUSPARSE(stat);
2762     }
2763 
2764     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2765 
2766     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2767       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2768       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2769       mmdata->mmBufferSize = mmBufferSize;
2770     }
2771 
2772   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2773     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2774   #endif
2775 
2776     mmdata->initialized = PETSC_TRUE;
2777   } else {
2778     /* to be safe, always update pointers of the mats */
2779     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2780     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2781     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2782   }
2783 
2784   /* do cusparseSpMM, which supports transpose on B */
2785   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2786 #else
2787   PetscInt k;
2788   /* cusparseXcsrmm does not support transpose on B */
2789   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2790     cublasHandle_t cublasv2handle;
2791     cublasStatus_t cerr;
2792 
2793     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2794     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2795     PetscCallCUBLAS(cerr);
2796     blda = B->cmap->n;
2797     k    = B->cmap->n;
2798   } else {
2799     k = B->rmap->n;
2800   }
2801 
2802   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2803   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2804   PetscCallCUSPARSE(stat);
2805 #endif
2806   PetscCall(PetscLogGpuTimeEnd());
2807   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2808   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2809   if (product->type == MATPRODUCT_RARt) {
2810     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2811     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2812   } else if (product->type == MATPRODUCT_PtAP) {
2813     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2814     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2815   } else {
2816     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2817   }
2818   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2819   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2820   PetscFunctionReturn(PETSC_SUCCESS);
2821 }
2822 
/*
  Symbolic phase for C = op(A)*op(B) with A of type MATSEQAIJCUSPARSE and B dense:
  fixes the sizes and block sizes of C according to the product type, turns C into
  a MATSEQDENSECUDA matrix (remembering whether the user originally asked for a
  host MATSEQDENSE result), and creates the MatMatCusparse product data consumed
  by MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA().
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nrows, ncols;
  PetscBool           Cisdense, isaijcusp;
  MatMatCusparse     *mdata;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isaijcusp));
  PetscCheck(isaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* sizes and block sizes of the result depend on the requested product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    nrows = A->rmap->n;
    ncols = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    nrows = A->cmap->n;
    ncols = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    nrows = A->rmap->n;
    ncols = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    nrows = B->cmap->n;
    ncols = B->cmap->n;
    if (B->cmap->bs > 0) { /* square result: both layouts inherit the block size of P's columns */
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    }
    break;
  case MATPRODUCT_RARt:
    nrows = B->rmap->n;
    ncols = B->rmap->n;
    if (B->rmap->bs > 0) { /* square result: both layouts inherit the block size of R's rows */
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    }
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, nrows, ncols, nrows, ncols));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &Cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* allocate the product data used by the numeric phase */
  PetscCall(PetscNew(&mdata));
  mdata->cisdense = Cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* RARt and PtAP need dense intermediate storage X for the sparse-times-dense partial product */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mdata->X));
    PetscCall(MatSetType(mdata->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    else PetscCall(MatSetSizes(mdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
  }
  C->product->data    = mdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2902 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) (SpGEMM) for
  MATSEQAIJCUSPARSE matrices. The sparsity pattern of C, the cuSPARSE
  descriptors and the work buffers were created in the symbolic phase and
  are retrieved from C->product->data; here only the numerical values of C
  are (re)computed on the GPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* C has no nonzeros: nothing to compute, but still run the assembly bookkeeping at finalize */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* if the symbolic phase exploited symmetry of A (or B), apply the same product-type remap here */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicit-transpose mult structs, since opA/opB stay NON_TRANSPOSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero are passed by device pointer, hence POINTER_MODE_DEVICE */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: the SpGEMMreuse API recomputes values directly into the descriptor set up at symbolic time */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older cuSPARSE: recompute with the buffers saved at symbolic time, then copy the result into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 legacy csrgemm path */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flops count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* the up-to-date values of C now live on the GPU */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3023 
3024 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3025 {
3026   Mat_Product                  *product = C->product;
3027   Mat                           A, B;
3028   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3029   Mat_SeqAIJ                   *a, *b, *c;
3030   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3031   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3032   PetscInt                      i, j, m, n, k;
3033   PetscBool                     flg;
3034   cusparseStatus_t              stat;
3035   MatProductType                ptype;
3036   MatMatCusparse               *mmdata;
3037   PetscLogDouble                flops;
3038   PetscBool                     biscompressed, ciscompressed;
3039 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3040   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3041   cusparseSpMatDescr_t BmatSpDescr;
3042 #else
3043   int cnz;
3044 #endif
3045   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3046 
3047   PetscFunctionBegin;
3048   MatCheckProduct(C, 1);
3049   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3050   A = product->A;
3051   B = product->B;
3052   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3053   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3054   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3055   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3056   a = (Mat_SeqAIJ *)A->data;
3057   b = (Mat_SeqAIJ *)B->data;
3058   /* product data */
3059   PetscCall(PetscNew(&mmdata));
3060   C->product->data    = mmdata;
3061   C->product->destroy = MatDestroy_MatMatCusparse;
3062 
3063   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3064   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3065   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3066   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3067   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3068   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3069 
3070   ptype = product->type;
3071   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3072     ptype                                          = MATPRODUCT_AB;
3073     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3074   }
3075   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3076     ptype                                          = MATPRODUCT_AB;
3077     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3078   }
3079   biscompressed = PETSC_FALSE;
3080   ciscompressed = PETSC_FALSE;
3081   switch (ptype) {
3082   case MATPRODUCT_AB:
3083     m    = A->rmap->n;
3084     n    = B->cmap->n;
3085     k    = A->cmap->n;
3086     Amat = Acusp->mat;
3087     Bmat = Bcusp->mat;
3088     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3089     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3090     break;
3091   case MATPRODUCT_AtB:
3092     m = A->cmap->n;
3093     n = B->cmap->n;
3094     k = A->rmap->n;
3095     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3096     Amat = Acusp->matTranspose;
3097     Bmat = Bcusp->mat;
3098     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3099     break;
3100   case MATPRODUCT_ABt:
3101     m = A->rmap->n;
3102     n = B->rmap->n;
3103     k = A->cmap->n;
3104     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3105     Amat = Acusp->mat;
3106     Bmat = Bcusp->matTranspose;
3107     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3108     break;
3109   default:
3110     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3111   }
3112 
3113   /* create cusparse matrix */
3114   PetscCall(MatSetSizes(C, m, n, m, n));
3115   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3116   c     = (Mat_SeqAIJ *)C->data;
3117   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3118   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3119   Ccsr  = new CsrMatrix;
3120 
3121   c->compressedrow.use = ciscompressed;
3122   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3123     c->compressedrow.nrows = a->compressedrow.nrows;
3124     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3125     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3126     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3127     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3128     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3129   } else {
3130     c->compressedrow.nrows  = 0;
3131     c->compressedrow.i      = NULL;
3132     c->compressedrow.rindex = NULL;
3133     Ccusp->workVector       = NULL;
3134     Cmat->cprowIndices      = NULL;
3135   }
3136   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3137   Ccusp->mat        = Cmat;
3138   Ccusp->mat->mat   = Ccsr;
3139   Ccsr->num_rows    = Ccusp->nrows;
3140   Ccsr->num_cols    = n;
3141   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3142   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3143   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3144   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3145   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3146   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3147   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3148   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3149   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3150   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3151   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3152     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3153     c->nz                = 0;
3154     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3155     Ccsr->values         = new THRUSTARRAY(c->nz);
3156     goto finalizesym;
3157   }
3158 
3159   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3160   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3161   Acsr = (CsrMatrix *)Amat->mat;
3162   if (!biscompressed) {
3163     Bcsr = (CsrMatrix *)Bmat->mat;
3164 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3165     BmatSpDescr = Bmat->matDescr;
3166 #endif
3167   } else { /* we need to use row offsets for the full matrix */
3168     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3169     Bcsr                 = new CsrMatrix;
3170     Bcsr->num_rows       = B->rmap->n;
3171     Bcsr->num_cols       = cBcsr->num_cols;
3172     Bcsr->num_entries    = cBcsr->num_entries;
3173     Bcsr->column_indices = cBcsr->column_indices;
3174     Bcsr->values         = cBcsr->values;
3175     if (!Bcusp->rowoffsets_gpu) {
3176       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3177       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3178       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3179     }
3180     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3181     mmdata->Bcsr      = Bcsr;
3182 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3183     if (Bcsr->num_rows && Bcsr->num_cols) {
3184       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3185       PetscCallCUSPARSE(stat);
3186     }
3187     BmatSpDescr = mmdata->matSpBDescr;
3188 #endif
3189   }
3190   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3191   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3192   /* precompute flops count */
3193   if (ptype == MATPRODUCT_AB) {
3194     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3195       const PetscInt st = a->i[i];
3196       const PetscInt en = a->i[i + 1];
3197       for (j = st; j < en; j++) {
3198         const PetscInt brow = a->j[j];
3199         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3200       }
3201     }
3202   } else if (ptype == MATPRODUCT_AtB) {
3203     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3204       const PetscInt anzi = a->i[i + 1] - a->i[i];
3205       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3206       flops += (2. * anzi) * bnzi;
3207     }
3208   } else { /* TODO */
3209     flops = 0.;
3210   }
3211 
3212   mmdata->flops = flops;
3213   PetscCall(PetscLogGpuTimeBegin());
3214 
3215 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3216   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3217   // cuda-12.2 requires non-null csrRowOffsets
3218   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3219   PetscCallCUSPARSE(stat);
3220   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3221   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3222   {
3223     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3224      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3225   */
3226     void *dBuffer1 = NULL;
3227     void *dBuffer2 = NULL;
3228     void *dBuffer3 = NULL;
3229     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3230     size_t bufferSize1 = 0;
3231     size_t bufferSize2 = 0;
3232     size_t bufferSize3 = 0;
3233     size_t bufferSize4 = 0;
3234     size_t bufferSize5 = 0;
3235 
3236     /* ask bufferSize1 bytes for external memory */
3237     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3238     PetscCallCUSPARSE(stat);
3239     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3240     /* inspect the matrices A and B to understand the memory requirement for the next step */
3241     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3242     PetscCallCUSPARSE(stat);
3243 
3244     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3245     PetscCallCUSPARSE(stat);
3246     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3247     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3248     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3249     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3250     PetscCallCUSPARSE(stat);
3251     PetscCallCUDA(cudaFree(dBuffer1));
3252     PetscCallCUDA(cudaFree(dBuffer2));
3253 
3254     /* get matrix C non-zero entries C_nnz1 */
3255     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3256     c->nz = (PetscInt)C_nnz1;
3257     /* allocate matrix C */
3258     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3259     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3260     Ccsr->values = new THRUSTARRAY(c->nz);
3261     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3262     /* update matC with the new pointers */
3263     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3264     PetscCallCUSPARSE(stat);
3265 
3266     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3267     PetscCallCUSPARSE(stat);
3268     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3269     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3270     PetscCallCUSPARSE(stat);
3271     PetscCallCUDA(cudaFree(dBuffer3));
3272     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3273     PetscCallCUSPARSE(stat);
3274     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3275   }
3276   #else
3277   size_t bufSize2;
3278   /* ask bufferSize bytes for external memory */
3279   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3280   PetscCallCUSPARSE(stat);
3281   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3282   /* inspect the matrices A and B to understand the memory requirement for the next step */
3283   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3284   PetscCallCUSPARSE(stat);
3285   /* ask bufferSize again bytes for external memory */
3286   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3287   PetscCallCUSPARSE(stat);
3288   /* The CUSPARSE documentation is not clear, nor the API
3289      We need both buffers to perform the operations properly!
3290      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3291      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3292      is stored in the descriptor! What a messy API... */
3293   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3294   /* compute the intermediate product of A * B */
3295   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3296   PetscCallCUSPARSE(stat);
3297   /* get matrix C non-zero entries C_nnz1 */
3298   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3299   c->nz = (PetscInt)C_nnz1;
3300   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3301                       mmdata->mmBufferSize / 1024));
3302   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3303   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3304   Ccsr->values = new THRUSTARRAY(c->nz);
3305   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3306   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3307   PetscCallCUSPARSE(stat);
3308   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3309   PetscCallCUSPARSE(stat);
3310   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3311 #else
3312   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3313   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3314                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3315   PetscCallCUSPARSE(stat);
3316   c->nz                = cnz;
3317   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3318   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319   Ccsr->values = new THRUSTARRAY(c->nz);
3320   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3321 
3322   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3323   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3324      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3325      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3326   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3327                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3328   PetscCallCUSPARSE(stat);
3329 #endif
3330   PetscCall(PetscLogGpuFlops(mmdata->flops));
3331   PetscCall(PetscLogGpuTimeEnd());
3332 finalizesym:
3333   c->free_a = PETSC_TRUE;
3334   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3335   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3336   c->free_ij = PETSC_TRUE;
3337   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3338     PetscInt      *d_i = c->i;
3339     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3340     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3341     ii = *Ccsr->row_offsets;
3342     jj = *Ccsr->column_indices;
3343     if (ciscompressed) d_i = c->compressedrow.i;
3344     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3345     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346   } else {
3347     PetscInt *d_i = c->i;
3348     if (ciscompressed) d_i = c->compressedrow.i;
3349     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3350     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3351   }
3352   if (ciscompressed) { /* need to expand host row offsets */
3353     PetscInt r = 0;
3354     c->i[0]    = 0;
3355     for (k = 0; k < c->compressedrow.nrows; k++) {
3356       const PetscInt next = c->compressedrow.rindex[k];
3357       const PetscInt old  = c->compressedrow.i[k];
3358       for (; r < next; r++) c->i[r + 1] = old;
3359     }
3360     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3361   }
3362   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3363   PetscCall(PetscMalloc1(m, &c->ilen));
3364   PetscCall(PetscMalloc1(m, &c->imax));
3365   c->maxnz         = c->nz;
3366   c->nonzerorowcnt = 0;
3367   c->rmax          = 0;
3368   for (k = 0; k < m; k++) {
3369     const PetscInt nn = c->i[k + 1] - c->i[k];
3370     c->ilen[k] = c->imax[k] = nn;
3371     c->nonzerorowcnt += (PetscInt)!!nn;
3372     c->rmax = PetscMax(c->rmax, nn);
3373   }
3374   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3375   PetscCall(PetscMalloc1(c->nz, &c->a));
3376   Ccsr->num_entries = c->nz;
3377 
3378   C->nonzerostate++;
3379   PetscCall(PetscLayoutSetUp(C->rmap));
3380   PetscCall(PetscLayoutSetUp(C->cmap));
3381   Ccusp->nonzerostate = C->nonzerostate;
3382   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3383   C->preallocated     = PETSC_TRUE;
3384   C->assembled        = PETSC_FALSE;
3385   C->was_assembled    = PETSC_FALSE;
3386   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3387     mmdata->reusesym = PETSC_TRUE;
3388     C->offloadmask   = PETSC_OFFLOAD_GPU;
3389   }
3390   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3391   PetscFunctionReturn(PETSC_SUCCESS);
3392 }
3393 
3394 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3395 
/* Query the command-line option that lets the user force the CPU backend for a matrix
   product even when both operands live on the GPU.

   Input:
     mat    - the product matrix (used for its communicator and options prefix)
     title  - heading shown by -help for this options block
     opt    - the option name to query (e.g. "-matmatmult_backend_cpu")
     fname  - manual-page name of the corresponding MatXxx() entry point

   In/Output:
     usecpu - on input the default; on output the user's choice */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE_CPUOption(Mat mat, const char title[], const char opt[], const char fname[], PetscBool *usecpu)
{
  PetscFunctionBegin;
  PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
  PetscCall(PetscOptionsBool(opt, "Use CPU code", fname, *usecpu, usecpu, NULL));
  PetscOptionsEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *fname = NULL, *apiopt = NULL, *title = NULL;

    /* Map the product type to (a) the MatXxx() API name (also used as manual-page entry and
       as the options title when called through the old API), (b) the per-API option name,
       and (c) the generic MatProduct options title. Types without a CPU-backend option
       (e.g. MATPRODUCT_ABt) leave fname NULL and skip the options query. */
    switch (product->type) {
    case MATPRODUCT_AB:
      fname  = "MatMatMult";
      apiopt = "-matmatmult_backend_cpu";
      title  = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      fname  = "MatTransposeMatMult";
      apiopt = "-mattransposematmult_backend_cpu";
      title  = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      fname  = "MatPtAP";
      apiopt = "-matptap_backend_cpu";
      title  = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      fname  = "MatRARt";
      apiopt = "-matrart_backend_cpu";
      title  = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      fname  = "MatMatMatMult";
      apiopt = "-matmatmatmult_backend_cpu";
      title  = "MatProduct_ABC";
      break;
    default:
      break;
    }
    if (fname) {
      if (product->api_user) PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_CPUOption(mat, fname, apiopt, fname, &usecpu));
      else PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_CPUOption(mat, title, "-mat_product_algorithm_backend_cpu", fname, &usecpu));
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no dedicated GPU kernel; build these from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3513 
/* MatMult: yy = A xx (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); /* yy arg NULL => no vector added */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3520 
/* MatMultAdd: zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3527 
/* MatMultHermitianTranspose: yy = A^H xx (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3534 
/* MatMultHermitianTransposeAdd: zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3541 
/* MatMultTranspose: yy = A^T xx (trans only, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3548 
/* Scatter-add kernel: y[idx[i]] += x[i] for each i in [0, n).

   Used to accumulate the compressed-row work vector (one entry per nonzero row) into the
   full-length result vector; idx holds the destination indices.
   NOTE(review): no atomics are used, so idx is assumed to contain no duplicates (each
   output entry is written by at most one thread) -- confirm with callers; the cprowIndices
   caller satisfies this since compressed-row indices are distinct rows.

   Launch: 1-D grid of 1-D blocks covering at least n threads; excess threads exit via the
   bounds guard. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Widen to PetscInt before multiplying: the previous `int i = blockIdx.x * blockDim.x + threadIdx.x`
     computed the flat index in (unsigned) int, which overflows when n exceeds INT_MAX with
     64-bit PetscInt builds. */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3554 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Shared driver behind all SeqAIJCUSPARSE MatMult* variants.
     A     - the matrix (copied to the GPU here if the host copy is newer)
     xx    - input vector
     yy    - vector to add, or NULL; may alias zz (in-place add)
     zz    - output vector
     trans - apply the (conjugate) transpose instead of A
     herm  - with trans, use the conjugate transpose; herm without trans is rejected

   "Compressed" matrices store only the nonzero rows (cprowIndices maps them to full rows),
   which forces the use of a work vector on one side of the SpMV and a scatter/gather step. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* logical lengths of x and y in y = op(A) x; set below for the CSR format */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* empty matrix: op(A) x is zero, so the result is yy (if given) or the zero vector */
  if (!a->nz) {
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose product: either let cuSPARSE transpose on the fly, or use (building on demand)
       an explicitly stored transpose of A */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* gather x[cprowIndices[i]] into workVector[i] (VecCUDAEqualsReverse assigns the
           permuted x entry to the paired work-vector slot) */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      /* opA indexes fixed-size arrays (matDescr_SpMV, cuSpMV) below, so guard against ABI drift */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) { /* lazily create the per-opA matrix descriptor from the cached CSR arrays */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    /* NOTE(review): thrust typically throws std::exception-derived types, which a char* handler
       would not catch -- confirm which exceptions this is meant to intercept */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop accounting: 2 flops (multiply+add) per stored nonzero; without the extra yy,
     one flop per nonzero row is subtracted -- presumably the first product per row needs no add */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3728 
/* MatMultTransposeAdd: zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3735 
/* Assembly is finalized on the host by the base SeqAIJ implementation; the device copy is
   refreshed lazily later (see the MatSeqAIJCUSPARSECopyToGPU calls in the mult kernels) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3742 
3743 /*@
3744   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
3745 
3746   Collective
3747 
3748   Input Parameters:
3749 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3750 . m    - number of rows
3751 . n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3753 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3754 
3755   Output Parameter:
3756 . A - the matrix
3757 
3758   Level: intermediate
3759 
3760   Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3762   calculations. For good matrix assembly performance the user should preallocate the matrix
3763   storage by setting the parameter `nz` (or the array `nnz`).
3764 
3765   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
3767   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3768 
3769   The AIJ format, also called
3770   compressed row storage, is fully compatible with standard Fortran
3771   storage.  That is, the stored row and column indices can begin at
3772   either one (as in Fortran) or zero.
3773 
3774   Specify the preallocated storage with either nz or nnz (not both).
3775   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3776   allocation.
3777 
3778   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
3779 
3780 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
3781           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3782 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* calls the SeqAIJ preallocation implementation directly (the type is fixed above);
     the cast drops const to match that routine's non-const signature */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3792 
/* Destroy a MATSEQAIJCUSPARSE matrix: release the GPU-side data (or the triangular-factor
   data when the matrix is factored), detach every composed method, then defer to the base
   SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* all method names composed on this object type; each must be reset before destruction */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); ++i) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3814 
3815 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3816 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the copy in place back to the
   CUSPARSE type so the duplicate carries the GPU data structures as well. */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3824 
/* Y += a*X on the GPU.  Fast paths:
   - SAME_NONZERO_PATTERN: a single cuBLAS axpy on the value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam (sparse matrix addition);
   - otherwise fall back to the CPU implementation (which rebuilds Y's structure). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* One of the two matrices is bound to the CPU: do the work on the host and
       drop Y's cached transpose since its values are about to change */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* Compare row offsets and column indices on the device; equal structure upgrades
       the operation to the cheap SAME_NONZERO_PATTERN path */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = 1.0*Y + a*X computed in place into Y's arrays via csrgeam */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* a and b live on the host stack, so switch the handle to host pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* Restore PETSc's default device pointer mode on the shared handle */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* Identical structure: the nonzero value arrays line up entry by entry */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* DIFFERENT_NONZERO_PATTERN (or the one-column workaround): host fallback */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3910 
/* Y *= a, performed on the device as a cuBLAS scal over the nonzero value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  /* cuBLAS takes 32-bit counts; this errors out if y->nz does not fit */
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  /* Cached inverse diagonal is stale after scaling */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3930 
/* Zero all stored values.  Zeros both the device copy (matrix and, if present, its
   cached transpose) and the host copy, then sets the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the device values were zeroed too */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      /* Keep the cached transpose consistent instead of invalidating it */
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  /* Host-side values: a->i[nrows] is the total number of stored nonzeros */
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3957 
/* Report that this matrix type stores its data in CUDA device memory */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3964 
/* Switch the matrix operation tables between the host (flg = PETSC_TRUE) and the
   device (flg = PETSC_FALSE) implementations, and (de)register the composed methods
   that only make sense on the device. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* Factored matrices keep their solve ops; just record the binding */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* Make sure the host copy is current before routing operations to the CPU */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* Clearing the SeqAIJ sub-ops restores the default host array accessors */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimization is only used by the host kernels */
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4030 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.  Creates (or reuses) the destination
   matrix per `reuse`, allocates the cuSPARSE-specific context in B->spptr, installs
   the device operation tables, and renames the object's type. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* MAT_INPLACE_MATRIX: A itself is converted (newmat already aliases A) */
  B = *newmat;

  /* Vectors created from this matrix should default to CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      /* Run cuSPARSE on PETSc's default stream so it orders with other PETSc GPU work */
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* Factored matrices carry a triangular-factors context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* Install the remaining device ops and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4091 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then
   convert it in place to the CUSPARSE type */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4099 
4100 /*MC
4101    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4102 
4103    Options Database Keys:
4104 +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4105 .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4106                                            Other options include ell (ellpack) or hyb (hybrid).
4107 .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4108 -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4109 
4110   Level: beginner
4111 
4112   Notes:
4113   These matrices can be in either CSR, ELL, or HYB format.
4114 
4115   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4116 
4117   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4118   if some integer values passed in do not fit in `int`.
4119 
4120 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4121 M*/
4122 
/* Register the cuSPARSE solver package for the factorization types it supports
   (LU/ILU and Cholesky/ICC) on MATSEQAIJCUSPARSE matrices */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4132 
/* Free the device-side context of an unfactored matrix: the CSR mult structures
   (matrix and cached transpose), scratch vectors/index arrays, and the cuSPARSE handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* deleting a NULL pointer is a no-op, so no guards needed here */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4150 
/* Free a CsrMatrix (its three device arrays plus the struct itself) and NULL the
   caller's pointer; a NULL input is a no-op */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(PETSC_SUCCESS);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4163 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor structure (legacy csrsv path used before CUDA 11.4):
   matrix descriptor, analysis info, CSR data, and scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    /* AA_h is page-locked host memory, hence cudaFreeHost */
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4182 
/* Free a mult structure: the stored matrix (CSR, or HYB on pre-11.0 CUDA), its
   descriptors, the device-resident scalar constants, and any cached SpMV/SpMM state */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha/beta constants live in device memory (the handle runs in device pointer mode) */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* One cached SpMV setup per operation kind (N, T, H) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4229 
/* Release everything held by a triangular-factors context (both the legacy pre-11.4
   csrsv path and the modern SpSV path) without destroying the context or its handle,
   so the structure can be refilled by a new factorization */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* Legacy csrsv-based factors (lower/upper, plus their transposes) */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* SpSV-based path: cudaFree/destroy calls below are safe on NULL/zeroed members */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4283 
/* Full teardown of a triangular-factors context: reset its contents, destroy the
   cuSPARSE handle, and free the context itself (unlike _Reset, which keeps both) */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4294 
/* Strict-weak ordering of (i, j) index pairs: lexicographic by row, then column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;               /* rows differ: row decides */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* equal rows: column decides */
  }
};
4303 
/* Mark the cached device transpose as out of date; with destroy = PETSC_TRUE also
   free its storage and the cached csr2csc permutation instead of keeping them for reuse */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4319 
/* Container destructor for the device-side COO struct: its perm/jmap arrays were
   allocated with cudaMalloc, the struct itself with PetscMalloc */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4330 
/* COO preallocation: run the host SeqAIJ preallocation (copying coo_i/coo_j down from
   the device first if the user passed device pointers), then mirror the resulting COO
   assembly maps (jmap, perm) to the GPU for use by MatSetValuesCOO. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE; /* were coo_i/coo_j given in device memory? */
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* Host preallocation needs host copies of the index arrays */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4371 
/* Grid-stride kernel accumulating user-provided COO values kv[] into the CSR value
   array a[].  For CSR entry i, jmap[i]..jmap[i+1] indexes (via perm[]) the COO input
   entries that map onto it.  With INSERT_VALUES the old value is discarded, with
   ADD_VALUES it is kept.  Launch with any 1-D grid; the stride loop covers nnz.
   Note: the index arithmetic is widened to PetscCount (64-bit) BEFORE multiplying,
   since blockIdx.x * blockDim.x and gridDim.x * blockDim.x are otherwise evaluated
   in 32-bit unsigned and can overflow for very large nnz. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4382 
/* Set/add matrix values from a COO value array v[] (host or device memory), using
   the device-side jmap/perm maps built by MatSetPreallocationCOO_SeqAIJCUSPARSE.
   Host input is staged through a temporary device buffer. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only access: no device sync of old values */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* Divide in 64-bit BEFORE narrowing to int: (int)(Annz + 255) would overflow
       when Annz is close to INT_MAX.  The kernel uses a grid-stride loop, so any
       positive block count is correct. */
    MatAddCOOValues<<<(int)((Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4422 
4423 /*@C
4424   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4425 
4426   Not Collective
4427 
4428   Input Parameters:
4429 + A          - the matrix
4430 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4431 
4432   Output Parameters:
4433 + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4434 - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4435 
4436   Level: developer
4437 
4438   Note:
4439   When compressed is true, the CSR structure does not contain empty rows
4440 
4441 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4442 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Both outputs must be requested together; otherwise this is a no-op */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  /* NOTE(review): these `if (i)`/`if (j)` guards are always true after the early
     return above; kept as defensive checks */
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* Device CSR only stores nonempty rows; build (and cache) the full n+1 row
         offsets on the GPU from the host a->i on first request */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4470 
4471 /*@C
4472   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4473 
4474   Not Collective
4475 
4476   Input Parameters:
4477 + A          - the matrix
4478 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4479 . i          - the CSR row pointers
4480 - j          - the CSR column indices
4481 
4482   Level: developer
4483 
4484 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4485 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* nothing is freed here; just invalidate the borrowed device pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed; /* accepted for symmetry with MatSeqAIJCUSPARSEGetIJ(); unused */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4496 
4497 /*@C
4498   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored
4499 
4500   Not Collective
4501 
4502   Input Parameter:
4503 . A - a `MATSEQAIJCUSPARSE` matrix
4504 
4505   Output Parameter:
4506 . a - pointer to the device data
4507 
4508   Level: developer
4509 
4510   Note:
4511   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4512 
4513 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4514 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL && spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* host data may be newer; sync it to the device first */
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;

    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get(); /* raw device pointer to the nonzero values */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4532 
4533 /*@C
4534   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4535 
4536   Not Collective
4537 
4538   Input Parameters:
4539 + A - a `MATSEQAIJCUSPARSE` matrix
4540 - a - pointer to the device data
4541 
4542   Level: developer
4543 
4544 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4545 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state change needed, just drop the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4555 
4556 /*@C
4557   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4558 
4559   Not Collective
4560 
4561   Input Parameter:
4562 . A - a `MATSEQAIJCUSPARSE` matrix
4563 
4564   Output Parameter:
4565 . a - pointer to the device data
4566 
4567   Level: developer
4568 
4569   Note:
4570   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4571 
4572 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4573 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL && spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: bring any newer host data to the device first */
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;

    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get();
  }
  /* the caller may modify the values: device copy becomes authoritative and the cached transpose values go stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4593 /*@C
4594   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4595 
4596   Not Collective
4597 
4598   Input Parameters:
4599 + A - a `MATSEQAIJCUSPARSE` matrix
4600 - a - pointer to the device data
4601 
4602   Level: developer
4603 
4604 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4605 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4617 
4618 /*@C
4619   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4620 
4621   Not Collective
4622 
4623   Input Parameter:
4624 . A - a `MATSEQAIJCUSPARSE` matrix
4625 
4626   Output Parameter:
4627 . a - pointer to the device data
4628 
4629   Level: developer
4630 
4631   Note:
4632   Does not trigger any host to device copies.
4633 
4634   It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current
4635 
4636 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4637 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL && spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: intentionally no host-to-device copy; the caller promises to set every value */
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;

    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get();
  }
  /* device copy becomes authoritative and the cached transpose values go stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4656 
4657 /*@C
4658   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4659 
4660   Not Collective
4661 
4662   Input Parameters:
4663 + A - a `MATSEQAIJCUSPARSE` matrix
4664 - a - pointer to the device data
4665 
4666   Level: developer
4667 
4668 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4669 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: invalidate the cached diagonal and advance the state counter */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4681 
/* Lexicographic (row, col) ordering on (row, col, value, flag) tuples; used to merge COO streams */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;                      /* primary key: row index */
    return thrust::get<1>(t1) < thrust::get<1>(t2);    /* tie-break: column index */
  }
};
4690 
/* Unary functor adding a fixed offset to an index; used with thrust transform iterators
   to relocate column indices (e.g. by A->cmap->n) and row offsets (e.g. by a->nz) */
struct Shift {
  int _shift; /* constant offset added to every input */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};
4697 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation.
   C = [A B]: C has A->rmap->n rows and A->cmap->n + B->cmap->n columns, with B's columns placed after A's.
   The merge is performed entirely on the device; the host CSR (c->i, c->j) is mirrored back at the end.
   With MAT_REUSE_MATRIX only the numerical values are updated, using the entry permutation cached in
   Ccusp->coords during the MAT_INITIAL_MATRIX call. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C from scratch: create the Mat, its cuSPARSE multiply struct, and the device CSR arrays */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* coords records, for every entry of A then B, its destination position in C's value array */
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag every entry with 1 (from A) or 0 (from B) so the origin of each merged entry can be recovered below */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* B's columns are placed after A's: shift them by A->cmap->n on the fly */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the two (row, col, val, origin-flag) streams into C, ordered by (row, col) */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
#if CCCL_VERSION >= 3001000
      cuda::std::advance(p2, Annz);
#else
      thrust::advance(p2, Annz);
#endif
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* split the merged positions by origin flag: A's destinations go to coords[0..Annz), B's to coords[Annz..) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
  #else
      auto pred = cuda::std::identity();
  #endif
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* stack the transposes: C^T's rows are A^T's rows followed by B^T's rows,
           with B^T's row offsets shifted by a->nz (A's entries come first in C^T's arrays) */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
#if CCCL_VERSION >= 3001000
          cuda::std::advance(rT, -1);
#else
          thrust::advance(rT, -1);
#endif
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device CSR structure into the host Mat_SeqAIJ (c->i, c->j, ilen, imax, ...) */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths and row statistics from the freshly copied host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: sparsity pattern unchanged; scatter the current values of A and B
       into C's value array via the coords permutation cached above */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0..Annz) holds A's destinations, coords[Annz..) holds B's (see MAT_INITIAL_MATRIX path) */
      auto pmid = Ccusp->coords->begin();
#if CCCL_VERSION >= 3001000
      cuda::std::advance(pmid, Acsr->num_entries);
#else
      thrust::advance(pmid, Acsr->num_entries);
#endif
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4994 
/* Copy (a subset of) the matrix nonzero values to v[]: when idx is given, v[k] = av[idx[k]]
   for k = 0..n-1 (gathered on the device); otherwise the first n values are copied verbatim.
   v may live in host or device memory; the destination kind is detected with isCudaMem().
   Fix: the final transfer log was PetscLogCpuToGpu(), but the guarded copy moves data
   device -> host (cudaMemcpyDeviceToHost), so it must be PetscLogGpuToCpu(). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices, then scatter av[idx[k]] into dv on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: plain contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* device -> host transfer just performed above */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5030