xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision cac3574093404e1e5401e84b5847fc709f0d30b9)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
/* Option-name table for MatCUSPARSEStorageFormat; the trailing three entries (enum type name,
   option prefix, terminating 0) follow the PetscOptionsEnum() convention */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
/* Each array's entries are listed in 0-based enum-value order so the PetscEnum index parsed from the
   command line maps directly onto the corresponding cuSPARSE enum value (consistency is asserted in
   MatSetFromOptions_SeqAIJCUSPARSE below) */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
61 
/* Forward declarations: factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Legacy (pre-CUDA-11.4) triangular-solve path using csrsv */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
/* Options and basic matrix operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destruction helpers for GPU-side structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

/* Host/device synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* COO assembly interface */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95 
/* Type-specific implementation behind MatCUSPARSESetFormat(): record the requested GPU storage
   format in the matrix's Mat_SeqAIJCUSPARSE structure. For a sequential matrix MAT_CUSPARSE_MULT
   and MAT_CUSPARSE_ALL both target the same single format field, so they share one code path. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: identical action for both supported operations */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
113 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if the matrix type registered one; no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
137 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): record whether MatSolve()
   should run on the CPU instead of the GPU for this matrix */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
146 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if the matrix type registered one; no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
170 
/* MatSetOption() implementation for MATSEQAIJCUSPARSE: handle the one GPU-relevant option here
   and delegate everything else to the host MATSEQAIJ implementation */
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
186 
/* Process command-line options for MATSEQAIJCUSPARSE matrices. Options are only meaningful for
   unfactored matrices; the PetscCheck()s guard against the cuSPARSE enums drifting away from the
   0-based positions assumed by the MatCUSPARSE*Algorithms[] string tables above. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format for MatMult() only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* storage format for all operations (overrides the option above when both are given later on the command line) */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    /* the option writes directly into cusparsestruct->use_cpu_solve; the setter call below keeps the public API path exercised */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
222 
223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build, on the device, a single CSR matrix M holding both triangular factors of A's (host-computed)
// LU factorization: rows of M contain the strictly-lower part of L (unit diagonal implied), the
// diagonal of U, then the strictly-upper part of U. The same device CSR arrays back two cuSPARSE
// descriptors (spMatDescr_L with FILL_MODE_LOWER/DIAG_TYPE_UNIT, spMatDescr_U with
// FILL_MODE_UPPER/DIAG_TYPE_NON_UNIT), so only one copy of the data lives on the GPU.
// Symbolic setup (row pointers, column indices, descriptors, SpSV buffers) is done once; numeric
// values are re-copied and re-analyzed every time the host factors change.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];    // strictly-lower entries of row i (unit diagonal not stored)
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // upper entries of row i, including the diagonal (U is stored backwards via a->diag)
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (PETSc stores the inverted diagonal in the factor)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
320 #else
/* Pre-CUDA-11.4 path: build (or refresh) the lower-triangular factor L on the GPU from the host
   ILU factorization of A, as an explicit CSR matrix with 1's inserted on the diagonal, and run the
   csrsv solve analysis. On the first call the full structure + values are built; on later calls
   only the values are refreshed (the sparsity pattern of the factor does not change). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries + strictly-lower entries of rows 1..n-1 (row 0 of L holds only its unit diagonal) */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the thrust assign()s below stream efficiently to the device */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal, then each later row is
           its strictly-lower entries followed by an explicit 1.0 diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h (pinned values buffer) is kept for fast value-only updates later */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): thrust normally throws exception objects (e.g. thrust::system_error), not
         char* — confirm this handler can actually catch the intended errors */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
451 
/* Pre-CUDA-11.4 path: build (or refresh) the upper-triangular factor U on the GPU from the host
   ILU factorization of A, as an explicit CSR matrix, and run the csrsv solve analysis. PETSc stores
   U backwards via a->diag (adiag[i] > adiag[i+1]) with an inverted diagonal, so rows are assembled
   from the bottom up and the diagonal is re-inverted (1/v[nz]). Mirrors
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host memory so the thrust assign()s below stream efficiently to the device */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first since U is stored backwards */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements (v[nz] holds the inverted diagonal, so 1/v[nz] restores it) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h (pinned values buffer) is kept for fast value-only updates later */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): thrust normally throws exception objects (e.g. thrust::system_error), not
         char* — confirm this handler can actually catch the intended errors */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
582 #endif
583 
/* Build (or refresh) the device-side ILU triangular factors of A and cache the
   row/column permutation indices on the GPU so the solve phase can apply them.
   On CUDA >= 11.4 a single factored CSR matrix is built; otherwise separate
   lower/upper triangular matrices plus a work vector are set up. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS   = aij->row;
  IS                            colIS   = aij->icol;
  const PetscInt                m       = A->rmap->n;
  PetscBool                     rowIdentity, colIdentity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
#endif

  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // the factored matrix is now sync'ed to the GPU

  /* Cache the row permutation on the device, unless it is the identity */
  PetscCall(ISIdentity(rowIS, &rowIdentity));
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(rowIS, &idx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(rowIS, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* Likewise cache the column permutation on the device */
  PetscCall(ISIdentity(colIS, &colIdentity));
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(colIS, &idx));
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(colIS, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
630 
631 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build the device-side copy of the host ICC/Cholesky factor of A (CUDA >= 11.4 path).
// The host factor U (see comments at MatICCFactorSymbolic_SeqAIJ() for its layout) is
// repacked into a regular CSR matrix M with an explicit unit diagonal, uploaded, and the
// cusparse SpSV descriptors and analyses needed by MatSolve_SeqAIJCUSPARSE_Cholesky()
// are prepared: both the U and Ut triangular solves share the same matrix descriptor
// (spMatDescr_U) but use separate SpSV descriptors and buffers.
// NOTE(review): "Cheolesky" in the name is a long-standing spelling; keep it in sync with callers.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); // device-side factor data
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;                    // host factor in CSR-like form; Adiag[i] locates row i's diagonal
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz; // column indices and nnz of the repacked factor M
  PetscScalar                  *Ma, *D;  // values of M and the diagonal (stored separately)

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr as the flag since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry comes first in each row of M
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device: the row pointers Ai can be reused as-is, the
      // column indices come from the temp Mj; values are uploaded on every (re)factorization below.
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse: the host staging buffers are kept alive across re-factorizations
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values (done on every numeric re-factorization, structure is reused)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // negate off-diagonal entries when repacking
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
716 
// Solve Ut D U x = b (Cholesky/ICC factorization held in A->spptr).
// The three stages are: (1) Ut y = b', (2) y = y .* D (the diagonal was already
// inverted in MatCholeskyFactorNumeric_SeqAIJ(), so a multiply applies D^{-1}),
// (3) U x' = y, where b'/x' are b/x permuted when the factorization used a
// non-trivial ordering.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                        m   = A->rmap->n;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar                    *barr;
  PetscScalar                          *xarr;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarr));
  PetscCall(VecCUDAGetArrayRead(b, &barr));
  xdev = thrust::device_pointer_cast(xarr);
  bdev = thrust::device_pointer_cast(barr);

  // Bind the rhs to the X descriptor, first gathering it into fs->X via the row permutation if one exists
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barr));
  }

  // Stage 1: triangular solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Stage 2: Y = Y .* diag, elementwise. cublas has no such vector-vector
  // multiply, hence thrust::transform on the default PETSc stream.
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Stage 3: triangular solve U X = Y, writing straight into x unless a column
  // permutation forces a detour through the intermediate buffer fs->X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? fs->X : xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Scatter the permuted solution back into x when a column permutation was used
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xdev));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barr));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarr));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772 #else
/*
  Build the cusparse csrsv triangular-solve data for the ICC factor of A (CUDA < 11.4 path).

  The host factor stores only the upper triangle U, with each row's diagonal entry
  stored last (accessed below as v[nz]).  Both triangular solves reuse the same
  upper-triangular CSR structure (AiUp/AjUp): the "upper" factor solves
  non-transposed with a unit diagonal, while the "lower" factor solves with
  CUSPARSE_OPERATION_TRANSPOSE, a non-unit diagonal, and separately scaled values
  (AALo).  The first call allocates the structures and runs the csrsv analyses;
  subsequent calls (numeric re-factorizations) only refresh the values on the GPU.

  NOTE(review): A->data is read both as Mat_SeqAIJ (a) and Mat_SeqSBAIJ (b); this
  relies on the two structs laying out i/j/a compatibly — confirm before changing.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* row offsets / column indices of the repacked U (pinned host memory) */
  PetscScalar                       *AAUp;        /* values for the upper (non-transposed) solve */
  PetscScalar                       *AALo;        /* values for the lower (transposed) solve */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers for the values; uploaded via thrust assign below */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) { /* first-time setup: build structure + values + analyses */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, moving each row's diagonal entry to the front */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: both factors store 1/diag there */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* off-diagonal entries: negated for the upper solve, additionally divided by the diagonal for the lower solve */
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* the "lower" factor is stored as an upper-triangular matrix and solved transposed (see solveOp below) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same structure (AiUp/AjUp) as the upper factor, different values (AALo) */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else { /* re-factorization: structure is unchanged, only refresh the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* same scaling as in the first-time setup above */
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969 #endif
970 
/* Build (or refresh) the device-side ICC/Cholesky factor of A and, for a
   non-natural ordering, cache the permutation (rperm) and its inverse (cperm)
   on the GPU for use in the solve phase. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            perm    = aij->row;
  const PetscInt                m       = A->rmap->n;
  PetscBool                     identity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
#endif
  factors->nnz = (aij->nz - m) * 2 + m; // nnz of the full (symmetric) factor, with only the upper triangle stored

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* Cache the ordering and its inverse on the device, unless it is the identity */
  PetscCall(ISIdentity(perm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *inv, *fwd;

    PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &inv));
    PetscCall(ISGetIndices(perm, &fwd));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(fwd, fwd + m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(inv, inv + m);
    PetscCall(ISRestoreIndices(invperm, &inv));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(perm, &fwd));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012 
/* Numeric Cholesky/ICC factorization: bring A's values back to the host, run the
   CPU numeric kernel, install the GPU solve callbacks, and push the new factor to
   the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); // the CPU kernel below reads A's host values
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; // the fresh factor currently lives on the host only

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  // one SpSV-based routine serves both directions (solve and solvetranspose use the same callback)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* pick the MatSolve variant according to whether the ordering is natural */
  Mat_SeqAIJ *bdata = (Mat_SeqAIJ *)B->data;
  PetscBool   natural;

  PetscCall(ISIdentity(bdata->row, &natural));
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build/refresh the triangular factors on the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1045 
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054   cusparseIndexBase_t                indexBase;
1055   cusparseMatrixType_t               matrixType;
1056   cusparseFillMode_t                 fillMode;
1057   cusparseDiagType_t                 diagType;
1058 
1059   PetscFunctionBegin;
1060   /* allocate space for the transpose of the lower triangular factor */
1061   PetscCall(PetscNew(&loTriFactorT));
1062   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063 
1064   /* set the matrix descriptors of the lower triangular factor */
1065   matrixType = cusparseGetMatType(loTriFactor->descr);
1066   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069 
1070   /* Create the matrix description */
1071   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076 
1077   /* set the operation */
1078   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079 
1080   /* allocate GPU space for the CSC of the lower triangular factor*/
1081   loTriFactorT->csrMat                 = new CsrMatrix;
1082   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088 
1089   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095   #endif
1096 
1097   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098   {
1099     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104   #else
1105                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106   #endif
1107     PetscCallCUSPARSE(stat);
1108   }
1109 
1110   PetscCallCUDA(WaitForCUDA());
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112 
1113   /* Create the solve analysis information */
1114   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120   #endif
1121 
1122   /* perform the solve analysis */
1123   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125 
1126   PetscCallCUDA(WaitForCUDA());
1127   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128 
1129   /* assign the pointer */
1130   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131 
1132   /*********************************************/
1133   /* Now the Transpose of the Upper Tri Factor */
1134   /*********************************************/
1135 
1136   /* allocate space for the transpose of the upper triangular factor */
1137   PetscCall(PetscNew(&upTriFactorT));
1138   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139 
1140   /* set the matrix descriptors of the upper triangular factor */
1141   matrixType = cusparseGetMatType(upTriFactor->descr);
1142   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145 
1146   /* Create the matrix description */
1147   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152 
1153   /* set the operation */
1154   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155 
1156   /* allocate GPU space for the CSC of the upper triangular factor*/
1157   upTriFactorT->csrMat                 = new CsrMatrix;
1158   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164 
1165   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171   #endif
1172 
1173   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   {
1175     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180   #else
1181                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182   #endif
1183     PetscCallCUSPARSE(stat);
1184   }
1185 
1186   PetscCallCUDA(WaitForCUDA());
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188 
1189   /* Create the solve analysis information */
1190   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196   #endif
1197 
1198   /* perform the solve analysis */
1199   /* christ, would it have killed you to put this stuff in a function????????? */
1200   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202 
1203   PetscCallCUDA(WaitForCUDA());
1204   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205 
1206   /* assign the pointer */
1207   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211 
/* Unary functor mapping a PetscScalar to a PetscInt via its real part;
   used with thrust::transform() to turn a scalar-encoded index array
   (e.g. a csr2csc permutation computed in scalar storage) back into integers. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar val)
  {
    return (PetscInt)PetscRealPart(val);
  }
};
1215 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) an explicit transpose of A
  on the GPU, cached in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Fast path: if A->transupdated is already set the routine returns immediately.
  Otherwise it (a) allocates the transpose mult-struct/CSR storage on first use, and
  (b) fills/updates the transposed values. For the CSR format the update path caches a
  permutation (csr2csc_i) mapping CSR value order to CSC value order, so subsequent
  refreshes are a single thrust::copy gather instead of a full csr2csc conversion.

  Collective on no communicator (sequential matrix); errors use PETSC_COMM_SELF.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  /* make sure the host CSR data is mirrored on the GPU before transposing it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* nothing to do: the cached transpose is up to date */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR (ELL/HYB) transposes cannot be incrementally updated; rebuild from scratch */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transposed CSR: dimensions swapped, same number of nonzeros */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* 32-bit copy of A's row offsets (a->i is PetscInt) needed by csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* pre-CUDA-11 only: HYB has no direct transpose, so go HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT uses A's dimensions (rmap x cmap) rather than the transposed
         ones; presumably this legacy path is only exercised for square matrices -- confirm */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* One-time computation of the CSR->CSC value permutation: run csr2csc on the
         sequence 0,1,...,nnz-1 (stored as scalars) and read back where each entry
         landed; csr2csc_i then lets later updates gather values directly. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* matrixT->values currently holds the permuted sequence; convert it to integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather A's values through the cached permutation to (re)fill the transposed values */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408 
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b with the cached cuSPARSE SpSV LU factorization (CUDA >= 11.4 path).
   Applies the row permutation (rpermIndices) to b if present, performs the two
   triangular solves L Y = X and U X = Y, then applies the column permutation
   (cpermIndices) to produce x. Flops logged: 2*nz - m for the two solves. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *tri  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij  = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       alg  = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrow = A->rmap->n;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  thrust::device_ptr<PetscScalar>       xdev = thrust::device_pointer_cast(xdata);
  thrust::device_ptr<const PetscScalar> bdev = thrust::device_pointer_cast(bdata);

  /* Reorder b with the row permutation if needed; the (possibly permuted) rhs
     becomes the content of the dense-vector descriptor X */
  if (tri->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, tri->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, tri->rpermIndices->end()), thrust::device_pointer_cast(tri->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, tri->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, (void *)bdata));
  }

  /* Forward solve: L Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_Y, tri->Y));
  /* Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, opA, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, alg, tri->spsvDescr_L));

  /* Backward solve: U X = Y; solve into the scratch buffer when a column
     permutation still has to be applied, otherwise directly into x */
  if (tri->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, tri->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, xdata));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, opA, &PETSC_CUSPARSE_ONE, tri->spMatDescr_U, tri->dnVecDescr_Y, tri->dnVecDescr_X, cusparse_scalartype, alg, tri->spsvDescr_U));

  /* Undo the column permutation, scattering the scratch result into x */
  if (tri->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(tri->X), tri->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(tri->X + nrow), tri->cpermIndices->end()), xdev));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrow));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461 
/* Solve A^T x = b with the cached SpSV LU factorization by running both triangular
   solves with op = CUSPARSE_OPERATION_TRANSPOSE (CUDA >= 11.4 path).
   On the first call the transpose SpSV descriptors (Lt, Ut) and their work buffers
   are created; the transpose solve analyses are (re)run whenever
   fs->updatedTransposeSpSVAnalysis is unset. Solve order is U^T then L^T, with the
   same rperm-before / cperm-after treatment as the forward solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    /* lazily create the descriptors/buffers for transpose solves; the matrix
       descriptors are still L and U -- only the operation is transposed */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (re)run the transpose solve analyses, e.g. after a refactorization */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1532 #else
1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Legacy (pre-CUDA-11.4) transpose solve A^T xx = bb using the csrsv API.
   The transposed triangular factors (loTriFactorT/upTriFactorT) are built on demand
   by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). Sequence: apply the row
   permutation to bb, solve with the transposed upper factor, then the transposed
   lower factor, and finally apply the column permutation (through the scratch
   vector, since that gather cannot be done in place). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder bb with the row permutation, writing the result into xx */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Solve with the transposed upper factor: xarray -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then solve with the transposed lower factor: tempGPU -> xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
/* Solve A^T x = b for a factored SeqAIJCUSPARSE matrix in natural ordering.
   The transposed triangular factors are built lazily on first use; the solve
   applies the transposed upper factor first (b -> work), then the transposed
   lower factor (work -> x). No permutations are applied in this path. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *tf   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)tf->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)tf->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)tf->workVector;

  PetscFunctionBegin;
  /* Create the transposed factors on the fly if they are not cached yet */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)tf->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)tf->upTriFactorPtrTranspose;
  }

  /* Expose the PETSc vectors as raw device arrays */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* First triangular solve (transposed U factor): b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(tf->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Second triangular solve (transposed L factor): work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(tf->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x, loT->solvePolicy, loT->solveBuffer));

  /* Hand the device arrays back to the vectors */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * tf->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1623 
/* MatSolve_SeqAIJCUSPARSE - GPU triangular solve for an LU-factored SeqAIJCUSPARSE
   matrix with a non-natural ordering: x = P_c (U \ (L \ (P_r b))).

   Input:  bb - right-hand side (device data read-only)
   Output: xx - solution (device data overwritten)

   Data flow: b --rperm--> temp --L-solve--> x --U-solve--> temp --cperm--> x.
   The row permutation is applied before the solves, the column permutation after. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: temp[i] = b[rperm[i]].
     Note the copy length is governed by the index-iterator range rpermIndices->begin()..end(),
     so the same bGPU base pointer is used for both ends of the permutation iterator. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: xarray <- L \ temp */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: temp <- U \ xarray (temp now holds the unpermuted solution) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[i] = temp[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1663 
/* Solve A x = b for a factored SeqAIJCUSPARSE matrix in natural ordering.
   With no row/column permutations needed, b feeds the first (L-factor) solve
   directly and the second (U-factor) solve writes straight into x, using the
   cached work vector as the intermediate. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *tf   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)tf->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)tf->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)tf->workVector;

  PetscFunctionBegin;
  /* Expose the PETSc vectors as raw device arrays */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* First triangular solve (L factor): b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(tf->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Second triangular solve (U factor): work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(tf->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  /* Hand the device arrays back to the vectors */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * tf->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1693 #endif
1694 
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of the GPU ILU(0).

   Copies A's values into fact's device CSR arrays (ILU(0) produces no fill, so the
   pattern set up by the symbolic phase is reused), factors in place with
   cusparseXcsrilu02(), then runs the numeric SpSV analyses needed by the L/U
   triangular solves. Installs the LU solve routines on fact.

   The third (MatFactorInfo) parameter is unused here; all sizing/analysis was done
   in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure A's latest values are resident on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot is a blocking query; only compiled into debug builds */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1749 
/* MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - symbolic setup for a zero-fill ILU(0)
   factorization performed entirely on the GPU via cusparseXcsrilu02() + cusparseSpSV.

   Since ILU(0) introduces no fill, fact reuses A's sparsity pattern; this routine
   copies A's i,j structure to the device, creates the descriptors for the combined
   factor M and its triangular views L (unit diagonal) and U (non-unit diagonal),
   allocates the factorization/solve buffers (sharing the factorization buffer with
   the larger SpSV buffer to save memory), runs the structural ilu02 analysis, and
   estimates the flops of the upcoming numeric phase.

   The two unnamed IS parameters (row/column orderings) are ignored by this path. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    /* ILU(0) here requires a square MATSEQAIJCUSPARSE matrix with a full diagonal */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* zero fill: factor has exactly A's nonzeros */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are views of the same in-place factor storage (csrVal): L has an implicit
     unit diagonal, U owns the stored diagonal entries */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  /* X, Y back the dense-vector descriptors used by SpSV buffer-size/analysis queries */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft; /* shadows the device-pointer Ai/Aj above; these are host CSR arrays */
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        /* NOTE(review): the exact count Adiag[i]-Ai[i] assigned just above is immediately overwritten by
           this half-row approximation — confirm the overwrite is intentional (result is only a flop estimate). */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1906 
/* Solve with an IC(0) factorization: only the L factor is stored, so the solve
   is a forward sweep with L followed by a sweep with its transpose, using the
   cached dense-vector descriptors and the workspace fs->Y as the intermediate. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward sweep: L y = b, with y kept in the workspace fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bdev));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Second sweep: Lt x = y; descriptor X is repointed at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xdev));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1937 
/* MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - numeric phase of the GPU IC(0).

   Copies A's values into fact's device CSR (the pattern was fixed by the symbolic
   phase), factors in place with cusparseXcsric02(), then runs the numeric SpSV
   analyses for the L and Lt solves and installs MatSolve_SeqAIJCUSPARSE_ICC0 for
   both solve and solvetranspose (valid since the factorization is symmetric).

   NOTE(review): unlike the ILU0 numeric routine, this one has no
   PetscLogGpuTimeBegin/End bracket — confirm whether that omission is intentional. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure A's latest values are resident on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot is a blocking query; only compiled into debug builds */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (needs valid factor values), so it must follow csric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* symmetric factorization: same routine serves both */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count estimated in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1990 
1991 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1992 {
1993   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1994   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1995   PetscInt                      m, nz;
1996 
1997   PetscFunctionBegin;
1998   if (PetscDefined(USE_DEBUG)) {
1999     PetscInt  i;
2000     PetscBool flg, missing;
2001 
2002     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2003     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2004     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2005     PetscCall(MatMissingDiagonal(A, &missing, &i));
2006     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2007   }
2008 
2009   /* Free the old stale stuff */
2010   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2011 
2012   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2013      but they will not be used. Allocate them just for easy debugging.
2014    */
2015   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2016 
2017   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2018   fact->factortype             = MAT_FACTOR_ICC;
2019   fact->info.factor_mallocs    = 0;
2020   fact->info.fill_ratio_given  = info->fill;
2021   fact->info.fill_ratio_needed = 1.0;
2022 
2023   aij->row = NULL;
2024   aij->col = NULL;
2025 
2026   /* ====================================================================== */
2027   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2028   /* We'll do in-place factorization on fact                                */
2029   /* ====================================================================== */
2030   const int *Ai, *Aj;
2031 
2032   m  = fact->rmap->n;
2033   nz = aij->nz;
2034 
2035   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2036   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2037   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2038   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2039   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2040   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2041 
2042   /* ====================================================================== */
2043   /* Create mat descriptors for M, L                                        */
2044   /* ====================================================================== */
2045   cusparseFillMode_t fillMode;
2046   cusparseDiagType_t diagType;
2047 
2048   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2049   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2050   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2051 
2052   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2053     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2054     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2055     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2056     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2057   */
2058   fillMode = CUSPARSE_FILL_MODE_LOWER;
2059   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2060   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2061   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2062   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2063 
2064   /* ========================================================================= */
2065   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2066   /* ========================================================================= */
2067   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2068   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2069 
2070   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2071   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2072 
2073   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2074   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2075 
2076   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2077   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2078 
2079   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2080   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2081 
2082   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2083      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2084    */
2085   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2086     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2087     fs->spsvBuffer_L = fs->factBuffer_M;
2088     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2089   } else {
2090     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2091     fs->spsvBuffer_Lt = fs->factBuffer_M;
2092     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2093   }
2094 
2095   /* ========================================================================== */
2096   /* Perform analysis of ic0 on M                                               */
2097   /* The lower triangular part of M has the same sparsity pattern as L          */
2098   /* ========================================================================== */
2099   int              structural_zero;
2100   cusparseStatus_t status;
2101 
2102   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2103   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2104   if (PetscDefined(USE_DEBUG)) {
2105     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2106     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2107     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2108   }
2109 
2110   /* Estimate FLOPs of the numeric factorization */
2111   {
2112     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2113     PetscInt      *Ai, nzRow, nzLeft;
2114     PetscLogDouble flops = 0.0;
2115 
2116     Ai = Aseq->i;
2117     for (PetscInt i = 0; i < m; i++) {
2118       nzRow = Ai[i + 1] - Ai[i];
2119       if (nzRow > 1) {
2120         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2121           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2122         */
2123         nzLeft = (nzRow - 1) / 2;
2124         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2125       }
2126     }
2127     fs->numericFactFlops = flops;
2128   }
2129   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2130   PetscFunctionReturn(PETSC_SUCCESS);
2131 }
2132 #endif
2133 
/* Numeric LU factorization: the factorization itself runs on the CPU via
   MatLUFactorNumeric_SeqAIJ(); afterwards, unless the user requested CPU solves,
   the CUSPARSE triangular-solve callbacks are installed on B and the factors are
   analyzed/copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field of A's Mat_SeqAIJCUSPARSE; B, being a factored matrix, carries Mat_SeqAIJCUSPARSETriFactors in spptr instead.
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host factorization reads the host values */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors currently live on the host only */

  if (!cusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Legacy path: choose the MatSolve flavor depending on whether the row/column
       orderings used by the factorization are both trivial (identity). */
    Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(b->row, &row_identity));
    PetscCall(ISIdentity(b->col, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU for the device solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2172 
/* Symbolic LU: discard any stale GPU triangular factors, perform the symbolic
   phase on the host, and route the numeric phase through the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2183 
/* Symbolic ILU: with CUDA >= 11.4, ILU(0) under natural (identity) orderings can be
   done entirely on the device; otherwise fall back to the host symbolic ILU(k) and
   let the numeric phase go through the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool rowIdentity = PETSC_FALSE, colIdentity = PETSC_FALSE;
  if (tri->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &rowIdentity));
    PetscCall(ISIdentity(iscol, &colIdentity));
  }
  /* device ILU(0) fast path: no fill levels and both orderings trivial */
  if (!info->levels && rowIdentity && colIdentity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* general case: host symbolic factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2206 
/* Symbolic ICC: with CUDA >= 11.4, ICC(0) under a natural (identity) permutation can
   be done entirely on the device; otherwise fall back to the host symbolic ICC(k). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool permIdentity = PETSC_FALSE;
  if (tri->factorizeOnDevice) PetscCall(ISIdentity(perm, &permIdentity));
  /* device ICC(0) fast path: no fill levels and a trivial permutation */
  if (!info->levels && permIdentity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* general case: host symbolic factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2226 
/* Symbolic Cholesky: reset stale GPU factors, run the host symbolic phase, and
   route the numeric phase through the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2237 
/* Report the solver package backing this factored matrix; composed on factor
   matrices under the name "MatFactorGetSolverType_C". */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2244 
2245 /*MC
2246   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2247   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2249   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2250   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2251   algorithms are not recommended. This class does NOT support direct solver operations.
2252 
2253   Level: beginner
2254 
2255 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2256           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2257 M*/
2258 
/* Create an (unfactored) factor matrix B of type MATSEQAIJCUSPARSE for A, wiring up the
   symbolic-factorization callbacks appropriate for ftype (LU/ILU/ILUDT or Cholesky/ICC).
   The -mat_factor_bind_factorization option lets the user choose whether the numeric
   factorization is attempted on the host or the device (default "device"). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* honor the factor prefix if one is set, else fall back to A's options prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  /* only "host" and "device" (case-insensitive) are accepted */
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    /* a CPU-bound A forces the plain SeqAIJ symbolic routines */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2311 
/* Synchronize the matrix values from the device to the host when the device copy is the
   freshest one (offloadmask == PETSC_OFFLOAD_GPU). For an unfactored matrix the values
   come from the CSR mult structure; for a matrix factored on the device (CUDA >= 11.4)
   they come from the triangular-factor storage. Only values are copied — the sparsity
   pattern is assumed identical on both sides. On success the mask becomes OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* same spptr, reinterpreted: factored matrices store Mat_SeqAIJCUSPARSETriFactors there */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2341 
/* Read-write access to the host value array: sync values down from the GPU first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2349 
/* End read-write access: values may have changed on the host, so mark the host copy fresh. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2357 
/* Read-only access to the host value array: sync values down from the GPU first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2365 
/* End read-only access: nothing was modified, so the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2372 
/* Write-only access to the host value array: no device-to-host sync is needed since
   the caller promises to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2379 
/* End write-only access: the host copy was rewritten, so mark it as the fresh one. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2387 
/* Expose the device-resident CSR arrays (row offsets i, column indices j, values a) of an
   unfactored SeqAIJCUSPARSE matrix, together with their memory type (CUDA). Each output
   pointer may be NULL if the caller does not need that array. The matrix is pushed to the
   GPU first so the returned pointers are current. Errors if A is factored, or if PetscInt
   is 64-bit (the device index arrays are 32-bit and cannot alias a PetscInt pointer).

   Fix vs. previous revision: the two 64-bit-index error messages read
   "does not supported"; corrected to "does not support". */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2418 
/* Push the host AIJ matrix to the GPU, (re)building the CUSPARSE mult structure when the
   nonzero pattern changed. Two paths:
   - fast path: same nonzero state and CSR format -> copy only the values;
   - slow path: destroy and rebuild the whole device structure (descriptors, scalar
     constants, CSR or ELL/HYB storage, compressed-row index list).
   On success the offload mask becomes OFFLOAD_BOTH, unless the host had no value array
   yet (pattern-only matrix), in which case it is left unchanged. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when host values are absent */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed: the cached transpose values are stale, but its pattern is still good */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* pattern changed (or non-CSR format): rebuild the device structure from scratch */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage skips empty rows; ridx maps compressed rows to true rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* pattern-only matrix: take nnz from the row offsets and do not claim OFFLOAD_BOTH */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1), needed because the cusparse
           handle is put in CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR image, convert it to HYB/ELL on device, then discard it */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* remember the pattern we just built so the next call can take the values-only fast path */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2568 
/* Thrust functor: accumulate the first tuple element into the second (dst += src). */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2576 
/* Thrust functor: copy the first tuple element into the second (dst = src). */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2584 
/* Thrust functor: copy in the opposite direction of VecCUDAEquals (first element = second). */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2592 
/* Per-product workspace attached to C->product->data for SeqAIJCUSPARSE mat-mat products. */
struct MatMatCusparse {
  PetscBool      cisdense; /* true when the product C was given as a dense matrix */
  PetscScalar   *Bt;       /* device buffer holding an explicit transpose of B, when needed */
  Mat            X;        /* intermediate dense result for RARt/PtAP */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count recorded at symbolic time, logged at numeric time */
  CsrMatrix     *Bcsr;     /* CSR image of B used by sparse-sparse products */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;  /* sparse descriptor of B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor of B (SpMM) */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor of C (SpMM) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers required by the CUDA >= 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* SpMM/SpGEMM compute buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2617 
/* Destroy callback for MatMatCusparse product data: release every device buffer and
   cusparse descriptor it owns, destroy the intermediate matrix X, then free the struct. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors/buffers are created lazily, so each may legitimately be NULL */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2641 
2642 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2643 
2644 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2645 {
2646   Mat_Product                  *product = C->product;
2647   Mat                           A, B;
2648   PetscInt                      m, n, blda, clda;
2649   PetscBool                     flg, biscuda;
2650   Mat_SeqAIJCUSPARSE           *cusp;
2651   cusparseStatus_t              stat;
2652   cusparseOperation_t           opA;
2653   const PetscScalar            *barray;
2654   PetscScalar                  *carray;
2655   MatMatCusparse               *mmdata;
2656   Mat_SeqAIJCUSPARSEMultStruct *mat;
2657   CsrMatrix                    *csrmat;
2658 
2659   PetscFunctionBegin;
2660   MatCheckProduct(C, 1);
2661   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2662   mmdata = (MatMatCusparse *)product->data;
2663   A      = product->A;
2664   B      = product->B;
2665   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2666   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2667   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2668      Instead of silently accepting the wrong answer, I prefer to raise the error */
2669   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2670   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2671   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2672   switch (product->type) {
2673   case MATPRODUCT_AB:
2674   case MATPRODUCT_PtAP:
2675     mat = cusp->mat;
2676     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2677     m   = A->rmap->n;
2678     n   = B->cmap->n;
2679     break;
2680   case MATPRODUCT_AtB:
2681     if (!A->form_explicit_transpose) {
2682       mat = cusp->mat;
2683       opA = CUSPARSE_OPERATION_TRANSPOSE;
2684     } else {
2685       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2686       mat = cusp->matTranspose;
2687       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2688     }
2689     m = A->cmap->n;
2690     n = B->cmap->n;
2691     break;
2692   case MATPRODUCT_ABt:
2693   case MATPRODUCT_RARt:
2694     mat = cusp->mat;
2695     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2696     m   = A->rmap->n;
2697     n   = B->rmap->n;
2698     break;
2699   default:
2700     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2701   }
2702   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2703   csrmat = (CsrMatrix *)mat->mat;
2704   /* if the user passed a CPU matrix, copy the data to the GPU */
2705   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2706   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2707   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2708 
2709   PetscCall(MatDenseGetLDA(B, &blda));
2710   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2711     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2712     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2713   } else {
2714     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2715     PetscCall(MatDenseGetLDA(C, &clda));
2716   }
2717 
2718   PetscCall(PetscLogGpuTimeBegin());
2719 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2720   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2721   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2722   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2723   #else
2724   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2725   #endif
2726 
2727   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2728   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2729     size_t mmBufferSize;
2730     if (mmdata->initialized && mmdata->Blda != blda) {
2731       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2732       mmdata->matBDescr = NULL;
2733     }
2734     if (!mmdata->matBDescr) {
2735       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2736       mmdata->Blda = blda;
2737     }
2738 
2739     if (mmdata->initialized && mmdata->Clda != clda) {
2740       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2741       mmdata->matCDescr = NULL;
2742     }
2743     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2744       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2745       mmdata->Clda = clda;
2746     }
2747 
2748   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2749     if (matADescr) {
2750       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2751       matADescr = NULL;
2752     }
2753   #endif
2754 
2755     if (!matADescr) {
2756       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2757                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2758       PetscCallCUSPARSE(stat);
2759     }
2760 
2761     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2762 
2763     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2764       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2765       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2766       mmdata->mmBufferSize = mmBufferSize;
2767     }
2768 
2769   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
2770     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2771   #endif
2772 
2773     mmdata->initialized = PETSC_TRUE;
2774   } else {
2775     /* to be safe, always update pointers of the mats */
2776     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2777     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2778     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2779   }
2780 
2781   /* do cusparseSpMM, which supports transpose on B */
2782   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2783 #else
2784   PetscInt k;
2785   /* cusparseXcsrmm does not support transpose on B */
2786   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2787     cublasHandle_t cublasv2handle;
2788     cublasStatus_t cerr;
2789 
2790     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2791     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2792     PetscCallCUBLAS(cerr);
2793     blda = B->cmap->n;
2794     k    = B->cmap->n;
2795   } else {
2796     k = B->rmap->n;
2797   }
2798 
2799   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2800   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2801   PetscCallCUSPARSE(stat);
2802 #endif
2803   PetscCall(PetscLogGpuTimeEnd());
2804   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2805   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2806   if (product->type == MATPRODUCT_RARt) {
2807     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2808     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2809   } else if (product->type == MATPRODUCT_PtAP) {
2810     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2811     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2812   } else {
2813     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2814   }
2815   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2816   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2817   PetscFunctionReturn(PETSC_SUCCESS);
2818 }
2819 
/* Symbolic phase for C = op(A)*op(B) where A is SEQAIJCUSPARSE and B is dense.
   Sets the sizes and (CUDA dense) type of C and allocates the MatMatCusparse
   product data that the numeric phase reuses across calls. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nrows, ncols;
  PetscBool           wasdense, isaijcusp;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  /* the symbolic phase must start from a clean product */
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isaijcusp));
  PetscCheck(isaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the resulting dense matrix, depending on the requested product */
  if (product->type == MATPRODUCT_AB) {
    nrows = A->rmap->n;
    ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    nrows = A->cmap->n;
    ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    nrows = A->rmap->n;
    ncols = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    nrows = B->cmap->n;
    ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    nrows = B->rmap->n;
    ncols = B->rmap->n;
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, nrows, ncols, nrows, ncols));
  /* remember whether the caller handed us a host dense C: the numeric phase then
     computes on the GPU and converts the result back to MATSEQDENSE at the end */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &wasdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* data carried over from the symbolic to the numeric phase */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = wasdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm cannot transpose B, so stage a device buffer that will hold B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* PtAP and RARt go through an intermediate dense matrix X (A*P resp. A*R^T) */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* no preallocation here: the first MatDenseCUDAGetArray call allocates on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2890 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all of type
   SEQAIJCUSPARSE. The sparsity pattern of C and the cuSPARSE SpGEMM descriptors were
   set up by the symbolic phase (stored in C->product->data); here only the numerical
   values of C are (re)computed on the GPU. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute, but still run the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  /* CopyToGpu does not copy when a matrix is bound to the CPU, which would silently
     produce stale values, so refuse the combination outright */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* the symbolic phase may have exploited symmetry to replace AtB/ABt with AB;
     mirror that substitution here so we pick the same mult structs it used */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized with explicit transpose matrices since cuSPARSE spgemm
     does not support the transpose operation flags */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: SpGEMMreuse keeps the pattern from the symbolic phase; a single
     compute call refreshes the values of C */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older CUDA 11.x: recompute with the buffers saved at symbolic time, then copy
     the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* CUDA < 11: legacy csrgemm path writes straight into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3011 
3012 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3013 {
3014   Mat_Product                  *product = C->product;
3015   Mat                           A, B;
3016   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3017   Mat_SeqAIJ                   *a, *b, *c;
3018   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3019   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3020   PetscInt                      i, j, m, n, k;
3021   PetscBool                     flg;
3022   cusparseStatus_t              stat;
3023   MatProductType                ptype;
3024   MatMatCusparse               *mmdata;
3025   PetscLogDouble                flops;
3026   PetscBool                     biscompressed, ciscompressed;
3027 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3028   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3029   cusparseSpMatDescr_t BmatSpDescr;
3030 #else
3031   int cnz;
3032 #endif
3033   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3034 
3035   PetscFunctionBegin;
3036   MatCheckProduct(C, 1);
3037   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3038   A = product->A;
3039   B = product->B;
3040   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3041   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3042   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3043   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3044   a = (Mat_SeqAIJ *)A->data;
3045   b = (Mat_SeqAIJ *)B->data;
3046   /* product data */
3047   PetscCall(PetscNew(&mmdata));
3048   C->product->data    = mmdata;
3049   C->product->destroy = MatDestroy_MatMatCusparse;
3050 
3051   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3052   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3053   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3054   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3055   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3056   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3057 
3058   ptype = product->type;
3059   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3060     ptype                                          = MATPRODUCT_AB;
3061     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3062   }
3063   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3064     ptype                                          = MATPRODUCT_AB;
3065     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3066   }
3067   biscompressed = PETSC_FALSE;
3068   ciscompressed = PETSC_FALSE;
3069   switch (ptype) {
3070   case MATPRODUCT_AB:
3071     m    = A->rmap->n;
3072     n    = B->cmap->n;
3073     k    = A->cmap->n;
3074     Amat = Acusp->mat;
3075     Bmat = Bcusp->mat;
3076     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3077     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3078     break;
3079   case MATPRODUCT_AtB:
3080     m = A->cmap->n;
3081     n = B->cmap->n;
3082     k = A->rmap->n;
3083     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3084     Amat = Acusp->matTranspose;
3085     Bmat = Bcusp->mat;
3086     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3087     break;
3088   case MATPRODUCT_ABt:
3089     m = A->rmap->n;
3090     n = B->rmap->n;
3091     k = A->cmap->n;
3092     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3093     Amat = Acusp->mat;
3094     Bmat = Bcusp->matTranspose;
3095     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3096     break;
3097   default:
3098     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3099   }
3100 
3101   /* create cusparse matrix */
3102   PetscCall(MatSetSizes(C, m, n, m, n));
3103   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3104   c     = (Mat_SeqAIJ *)C->data;
3105   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3106   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3107   Ccsr  = new CsrMatrix;
3108 
3109   c->compressedrow.use = ciscompressed;
3110   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3111     c->compressedrow.nrows = a->compressedrow.nrows;
3112     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3113     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3114     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3115     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3116     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3117   } else {
3118     c->compressedrow.nrows  = 0;
3119     c->compressedrow.i      = NULL;
3120     c->compressedrow.rindex = NULL;
3121     Ccusp->workVector       = NULL;
3122     Cmat->cprowIndices      = NULL;
3123   }
3124   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3125   Ccusp->mat        = Cmat;
3126   Ccusp->mat->mat   = Ccsr;
3127   Ccsr->num_rows    = Ccusp->nrows;
3128   Ccsr->num_cols    = n;
3129   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3130   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3131   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3132   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3133   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3134   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3135   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3136   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3137   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3138   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3139   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3140     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3141     c->nz                = 0;
3142     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3143     Ccsr->values         = new THRUSTARRAY(c->nz);
3144     goto finalizesym;
3145   }
3146 
3147   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3148   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3149   Acsr = (CsrMatrix *)Amat->mat;
3150   if (!biscompressed) {
3151     Bcsr = (CsrMatrix *)Bmat->mat;
3152 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3153     BmatSpDescr = Bmat->matDescr;
3154 #endif
3155   } else { /* we need to use row offsets for the full matrix */
3156     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3157     Bcsr                 = new CsrMatrix;
3158     Bcsr->num_rows       = B->rmap->n;
3159     Bcsr->num_cols       = cBcsr->num_cols;
3160     Bcsr->num_entries    = cBcsr->num_entries;
3161     Bcsr->column_indices = cBcsr->column_indices;
3162     Bcsr->values         = cBcsr->values;
3163     if (!Bcusp->rowoffsets_gpu) {
3164       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3165       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3166       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3167     }
3168     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3169     mmdata->Bcsr      = Bcsr;
3170 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3171     if (Bcsr->num_rows && Bcsr->num_cols) {
3172       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3173       PetscCallCUSPARSE(stat);
3174     }
3175     BmatSpDescr = mmdata->matSpBDescr;
3176 #endif
3177   }
3178   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3179   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3180   /* precompute flops count */
3181   if (ptype == MATPRODUCT_AB) {
3182     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3183       const PetscInt st = a->i[i];
3184       const PetscInt en = a->i[i + 1];
3185       for (j = st; j < en; j++) {
3186         const PetscInt brow = a->j[j];
3187         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3188       }
3189     }
3190   } else if (ptype == MATPRODUCT_AtB) {
3191     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3192       const PetscInt anzi = a->i[i + 1] - a->i[i];
3193       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3194       flops += (2. * anzi) * bnzi;
3195     }
3196   } else { /* TODO */
3197     flops = 0.;
3198   }
3199 
3200   mmdata->flops = flops;
3201   PetscCall(PetscLogGpuTimeBegin());
3202 
3203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3204   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3205   // cuda-12.2 requires non-null csrRowOffsets
3206   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3207   PetscCallCUSPARSE(stat);
3208   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3209   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3210   {
3211     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3212      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3213   */
3214     void *dBuffer1 = NULL;
3215     void *dBuffer2 = NULL;
3216     void *dBuffer3 = NULL;
3217     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3218     size_t bufferSize1 = 0;
3219     size_t bufferSize2 = 0;
3220     size_t bufferSize3 = 0;
3221     size_t bufferSize4 = 0;
3222     size_t bufferSize5 = 0;
3223 
3224     /* ask bufferSize1 bytes for external memory */
3225     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3226     PetscCallCUSPARSE(stat);
3227     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3228     /* inspect the matrices A and B to understand the memory requirement for the next step */
3229     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3230     PetscCallCUSPARSE(stat);
3231 
3232     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3233     PetscCallCUSPARSE(stat);
3234     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3235     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3236     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3237     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3238     PetscCallCUSPARSE(stat);
3239     PetscCallCUDA(cudaFree(dBuffer1));
3240     PetscCallCUDA(cudaFree(dBuffer2));
3241 
3242     /* get matrix C non-zero entries C_nnz1 */
3243     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3244     c->nz = (PetscInt)C_nnz1;
3245     /* allocate matrix C */
3246     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3247     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3248     Ccsr->values = new THRUSTARRAY(c->nz);
3249     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3250     /* update matC with the new pointers */
3251     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3252     PetscCallCUSPARSE(stat);
3253 
3254     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3255     PetscCallCUSPARSE(stat);
3256     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3257     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3258     PetscCallCUSPARSE(stat);
3259     PetscCallCUDA(cudaFree(dBuffer3));
3260     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3261     PetscCallCUSPARSE(stat);
3262     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3263   }
3264   #else
3265   size_t bufSize2;
3266   /* ask bufferSize bytes for external memory */
3267   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3268   PetscCallCUSPARSE(stat);
3269   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3270   /* inspect the matrices A and B to understand the memory requirement for the next step */
3271   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3272   PetscCallCUSPARSE(stat);
3273   /* ask bufferSize again bytes for external memory */
3274   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3275   PetscCallCUSPARSE(stat);
3276   /* The CUSPARSE documentation is not clear, nor the API
3277      We need both buffers to perform the operations properly!
3278      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3279      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3280      is stored in the descriptor! What a messy API... */
3281   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3282   /* compute the intermediate product of A * B */
3283   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3284   PetscCallCUSPARSE(stat);
3285   /* get matrix C non-zero entries C_nnz1 */
3286   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3287   c->nz = (PetscInt)C_nnz1;
3288   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3289                       mmdata->mmBufferSize / 1024));
3290   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3291   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3292   Ccsr->values = new THRUSTARRAY(c->nz);
3293   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3294   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3295   PetscCallCUSPARSE(stat);
3296   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3297   PetscCallCUSPARSE(stat);
3298   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3299 #else
3300   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3301   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3302                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3303   PetscCallCUSPARSE(stat);
3304   c->nz                = cnz;
3305   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3306   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3307   Ccsr->values = new THRUSTARRAY(c->nz);
3308   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3309 
3310   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3311   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3312      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3313      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3314   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3315                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3316   PetscCallCUSPARSE(stat);
3317 #endif
3318   PetscCall(PetscLogGpuFlops(mmdata->flops));
3319   PetscCall(PetscLogGpuTimeEnd());
3320 finalizesym:
3321   c->singlemalloc = PETSC_FALSE;
3322   c->free_a       = PETSC_TRUE;
3323   c->free_ij      = PETSC_TRUE;
3324   PetscCall(PetscMalloc1(m + 1, &c->i));
3325   PetscCall(PetscMalloc1(c->nz, &c->j));
3326   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3327     PetscInt      *d_i = c->i;
3328     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3329     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3330     ii = *Ccsr->row_offsets;
3331     jj = *Ccsr->column_indices;
3332     if (ciscompressed) d_i = c->compressedrow.i;
3333     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3334     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3335   } else {
3336     PetscInt *d_i = c->i;
3337     if (ciscompressed) d_i = c->compressedrow.i;
3338     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3339     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3340   }
3341   if (ciscompressed) { /* need to expand host row offsets */
3342     PetscInt r = 0;
3343     c->i[0]    = 0;
3344     for (k = 0; k < c->compressedrow.nrows; k++) {
3345       const PetscInt next = c->compressedrow.rindex[k];
3346       const PetscInt old  = c->compressedrow.i[k];
3347       for (; r < next; r++) c->i[r + 1] = old;
3348     }
3349     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3350   }
3351   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3352   PetscCall(PetscMalloc1(m, &c->ilen));
3353   PetscCall(PetscMalloc1(m, &c->imax));
3354   c->maxnz         = c->nz;
3355   c->nonzerorowcnt = 0;
3356   c->rmax          = 0;
3357   for (k = 0; k < m; k++) {
3358     const PetscInt nn = c->i[k + 1] - c->i[k];
3359     c->ilen[k] = c->imax[k] = nn;
3360     c->nonzerorowcnt += (PetscInt) !!nn;
3361     c->rmax = PetscMax(c->rmax, nn);
3362   }
3363   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3364   PetscCall(PetscMalloc1(c->nz, &c->a));
3365   Ccsr->num_entries = c->nz;
3366 
3367   C->nonzerostate++;
3368   PetscCall(PetscLayoutSetUp(C->rmap));
3369   PetscCall(PetscLayoutSetUp(C->cmap));
3370   Ccusp->nonzerostate = C->nonzerostate;
3371   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3372   C->preallocated     = PETSC_TRUE;
3373   C->assembled        = PETSC_FALSE;
3374   C->was_assembled    = PETSC_FALSE;
3375   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3376     mmdata->reusesym = PETSC_TRUE;
3377     C->offloadmask   = PETSC_OFFLOAD_GPU;
3378   }
3379   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3380   PetscFunctionReturn(PETSC_SUCCESS);
3381 }
3382 
3383 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3384 
/* Handles sparse or dense B.

   Selects the product-symbolic routine for mat->product:
   - dense B: GPU AIJ x DENSECUDA kernels, unless A is bound to the CPU;
   - cusparse B (and, for ABC, cusparse C): GPU AIJ x AIJ kernels, unless the user
     forces the CPU backend through the -mat*_backend_cpu options handled below;
   - anything else: fall back to the plain SeqAIJ dispatch. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU sparse-sparse path when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* per-product-type command-line override; the option name depends on whether the
       caller used the old API (MatMatMult etc., product->api_user) or the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are reduced to sequences of two-matrix products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3502 
/* yy = A*xx on the GPU; delegates to the shared kernel with no add (yy slot NULL), no transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3509 
/* zz = A*xx + yy on the GPU; delegates to the shared kernel with no transpose */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3516 
/* yy = A^H*xx on the GPU; delegates to the shared kernel with trans=herm=PETSC_TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3523 
/* zz = A^H*xx + yy on the GPU; delegates to the shared kernel with trans=herm=PETSC_TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3530 
/* yy = A^T*xx on the GPU; delegates to the shared kernel with trans=PETSC_TRUE, herm=PETSC_FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3537 
/* y[idx[i]] += x[i] for 0 <= i < n (device kernel).
   Launch shape: 1-D grid of 1-D blocks covering at least n threads; surplus threads
   exit through the bounds guard. NOTE(review): the += is unguarded, so callers must
   pass idx[] with distinct entries (here it is the compressed-row index list) or the
   updates would race. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* widen to PetscInt before multiplying so the flat index cannot overflow `int`
     when n approaches the PetscInt range */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3543 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Shared implementation behind the MatMult/MatMultAdd/MatMultTranspose(Add)/
   MatMultHermitianTranspose(Add)_SeqAIJCUSPARSE wrappers. yy may be NULL (no add) and
   may alias zz. When the matrix stores compressed rows (zero rows dropped), a work
   vector holds the short product (non-transpose case) or the short input (transpose
   case), and results are scattered into the full-length output afterwards. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  /* herm without trans would mean plain conjugation, which this kernel does not encode */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* empty matrix: result is just y (or zero), no cuSPARSE call needed */
  if (!a->nz) {
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* pick the matrix structure (plain or explicit transpose) and the cuSPARSE operation */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the untransposed storage */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use (and build on demand) the explicitly stored transpose */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) { /* build the per-opA descriptor lazily (see #212 workaround above) */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy csrmv path */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the add, diagonal-free rows save one op each */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3729 
/* zz = A^T*xx + yy on the GPU; delegates to the shared kernel with trans=PETSC_TRUE, herm=PETSC_FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3736 
/* Host-side assembly only; the device copy is refreshed later by MatSeqAIJCUSPARSECopyToGPU
   (called e.g. at the top of the mult kernels) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3743 
3744 /*@
3745   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3746   (the default parallel PETSc format).
3747 
3748   Collective
3749 
3750   Input Parameters:
3751 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3752 . m    - number of rows
3753 . n    - number of columns
3754 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
3755 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3756 
3757   Output Parameter:
3758 . A - the matrix
3759 
3760   Level: intermediate
3761 
3762   Notes:
3763   This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
3764   calculations. For good matrix assembly performance the user should preallocate the matrix
3765   storage by setting the parameter `nz` (or the array `nnz`).
3766 
3767   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3768   MatXXXXSetPreallocation() paradgm instead of this routine directly.
3769   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3770 
3771   The AIJ format, also called
3772   compressed row storage, is fully compatible with standard Fortran
3773   storage.  That is, the stored row and column indices can begin at
3774   either one (as in Fortran) or zero.
3775 
3776   Specify the preallocated storage with either nz or nnz (not both).
3777   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3778   allocation.
3779 
3780 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3781 @*/
3782 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3783 {
3784   PetscFunctionBegin;
3785   PetscCall(MatCreate(comm, A));
3786   PetscCall(MatSetSizes(*A, m, n, m, n));
3787   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3788   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3789   PetscFunctionReturn(PETSC_SUCCESS);
3790 }
3791 
/* Tear down a MATSEQAIJCUSPARSE matrix: free the GPU-side storage, clear every composed
   method slot, then hand off to the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed function slots to reset, cleared in this order */
  const char *const composed_funcs[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                        "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* device-side data lives in different structs for factored vs. unfactored matrices */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed_funcs) / sizeof(composed_funcs[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed_funcs[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3813 
3814 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3815 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the copy in place back to MATSEQAIJCUSPARSE */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3823 
/* Compute Y = Y + a*X for two SEQAIJCUSPARSE matrices, on the GPU when both are bound there.
   Fast paths: identical nonzero patterns use a flat cuBLAS axpy on the value arrays;
   SUBSET_NONZERO_PATTERN uses cusparse csrgeam with coefficients (a, 1).
   All other cases fall back to the host implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one matrix is bound to the CPU (different axpy implementation): use the host kernel
       and mark Y's cached transpose stale since its values will change */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nnz counts plus identical
     row offsets and column indices (compared on device) mean same pattern */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y's pattern contains X's: csrgeam computes Y = a*X + 1*Y in place (output
       pattern/arrays are Y's own) */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* scalars a and b live on the host, so switch pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 (csrgeam2) requires an explicit workspace buffer */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the handle's default pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the two value arrays are conforming, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* general case (different patterns): fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3909 
/* Scale all stored nonzeros of Y by a, directly on the device via cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  /* the value array is contiguous, so a single scal over nz entries does the job */
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3929 
/* Zero all stored nonzeros of A. The device copy (and its cached transpose, if any)
   is zeroed with thrust::fill; the host copy with PetscArrayzero. The offload mask
   is set to BOTH only when the device values were actually zeroed. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the device value array was zeroed too */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    /* keep the cached transpose consistent instead of invalidating it */
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3956 
/* Bind/unbind A to the CPU: flg = PETSC_TRUE installs the host (SeqAIJ) operation
   table and removes the GPU-only composed methods; flg = PETSC_FALSE installs the
   CUSPARSE operation table and composes the GPU methods back. Factored matrices
   only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure host data is current before routing operations to the CPU */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the GPU array accessors installed below in the else branch */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* route raw value-array access through the GPU-aware variants */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the host kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4020 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE. Allocates the GPU-side spptr structure
   (Mat_SeqAIJCUSPARSE for unfactored, Mat_SeqAIJCUSPARSETriFactors for factored
   matrices), installs the CUSPARSE operation table, and composes the GPU methods.
   Supports MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX, and in-place conversion. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix should live on the GPU as well */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
      /* algorithm selections below depend on the cusparse API generation */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining GPU operations and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4080 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place to SEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4088 
4089 /*MC
4090    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4091 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format; with CUDA 11.0 and later only the CSR format is supported.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4095 
4096    Options Database Keys:
4097 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4098 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4099                                       Other options include ell (ellpack) or hyb (hybrid).
4100 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4101 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4102 
4103   Level: beginner
4104 
4105 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4106 M*/
4107 
/* Register the cuSPARSE solver package for all factorization kinds it supports;
   a single factory routine serves LU, Cholesky, ILU, and ICC */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(ftypes) / sizeof(ftypes[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4117 
/* Free the GPU-side data of an unfactored SEQAIJCUSPARSE matrix (the Mat_SeqAIJCUSPARSE
   structure hanging off mat->spptr); a no-op when nothing was ever allocated */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
  /* release the device matrix, its cached transpose, then the auxiliary device arrays */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose, cusparsestruct->format));
  delete cusparsestruct->workVector;
  delete cusparsestruct->rowoffsets_gpu;
  delete cusparsestruct->csr2csc_i;
  delete cusparsestruct->coords;
  if (cusparsestruct->handle) PetscCallCUSPARSE(cusparseDestroy(cusparsestruct->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4135 
/* Delete a CsrMatrix (its three thrust device arrays plus the struct itself)
   and reset the caller's pointer; safe to call with *mat == NULL */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4148 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free a triangular-factor structure (legacy csrsv path, CUDA < 11.4 only):
   matrix descriptor, csrsv analysis info, CSR data, and scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    /* AA_h is pinned host memory, hence cudaFreeHost rather than cudaFree */
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4167 
/* Free a mat-mult structure: the stored matrix (CSR or, pre-CUDA-11, ELL/HYB),
   descriptors, device scalar constants, and the cached SpMV/SpMM descriptors */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants (1, 0) kept in device memory for CUSPARSE_POINTER_MODE_DEVICE calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* the three cached SpMV setups (one per operation variant) each own a buffer
       and dense-vector descriptors */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4214 
/* Release all device data held by a triangular-factors structure, leaving the
   structure itself (and its cusparse handle) intact so it can be refilled by a
   new factorization. Handles both the legacy (< CUDA 11.4) and the SpSV-based
   (>= CUDA 11.4) storage layouts. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy path: four separate triangular-factor structures plus a work vector */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* SpSV path: device CSR copies, work vectors, analysis buffers, and descriptors */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4268 
/* Fully destroy a triangular-factors structure: reset (frees all device data),
   destroy the cusparse handle, and free the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy(fs->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4279 
/* Strict lexicographic ordering of (row, column) index pairs, usable both in
   host code and inside thrust device algorithms */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2; /* rows differ: order by row */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* tie-break on column */
  }
};
4288 
/* Mark the cached device transpose of A as stale; with destroy == PETSC_TRUE the
   transpose structure and the csr2csc permutation used to rebuild it are freed too */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose, cusparsestruct->format));
    delete cusparsestruct->csr2csc_i;
    cusparsestruct->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4304 
/* Container destructor for the device-side COO assembly structure; perm and jmap
   are device allocations made in MatSetPreallocationCOO_SeqAIJCUSPARSE() */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *coo_struct = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo_struct->perm));
  PetscCallCUDA(cudaFree(coo_struct->jmap));
  PetscCall(PetscFree(coo_struct));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4315 
/* Preallocate A from COO indices (coo_i, coo_j), which may live in host or device
   memory. The host SeqAIJ implementation builds the pattern and a host-side COO
   struct; this routine then mirrors the jmap/perm arrays of that struct to the
   device and attaches the device copy to the matrix for MatSetValuesCOO(). */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE; /* were coo_i/coo_j given in device memory? */
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host preallocation routine needs host copies of the index arrays */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4361 
/* Kernel: scatter-accumulate COO input values kv[] into the CSR value array a[].
   For nonzero i, the source positions are perm[jmap[i]] .. perm[jmap[i+1]-1].
   With INSERT_VALUES the old a[i] is discarded, otherwise the sum is added to it.
   Launched with a 1D grid; the grid-stride loop makes any launch size correct. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4372 
/* Set/add values of A from the COO array v[] (host or device memory), using the
   device-side mapping built by MatSetPreallocationCOO_SeqAIJCUSPARSE(). With
   INSERT_VALUES the current values are overwritten, with ADD_VALUES they are
   accumulated. v[] must have the length passed to the preallocation call. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v; /* device-resident view of v[] */
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  /* guard against a NULL dereference when COO preallocation was never performed */
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix COO structure not found; call MatSetPreallocationCOO() first");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites every entry, so the old device values need not be valid */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4412 
4413 /*@C
4414   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4415 
4416   Not Collective
4417 
4418   Input Parameters:
4419 + A          - the matrix
4420 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4421 
4422   Output Parameters:
4423 + i - the CSR row pointers
4424 - j - the CSR column indices
4425 
4426   Level: developer
4427 
4428   Note:
4429   When compressed is true, the CSR structure does not contain empty rows
4430 
4431 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4432 @*/
/* Return the device CSR row offsets (i) and column indices (j) of A. When the matrix
   uses compressed rows and the caller asked for the uncompressed form, a full row
   offset array is built on the device (once) from the host a->i and cached. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* NOTE(review): both output pointers must be requested; passing only one returns nothing */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4460 
4461 /*@C
4462   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4463 
4464   Not Collective
4465 
4466   Input Parameters:
4467 + A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4469 . i          - the CSR row pointers
4470 - j          - the CSR column indices
4471 
4472   Level: developer
4473 
4474 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4475 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: there is nothing to undo, the borrowed pointers are simply dropped */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4486 
4487 /*@C
4488   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4489 
4490   Not Collective
4491 
4492   Input Parameter:
4493 . A - a `MATSEQAIJCUSPARSE` matrix
4494 
4495   Output Parameter:
4496 . a - pointer to the device data
4497 
4498   Level: developer
4499 
4500   Note:
4501   May trigger host-device copies if up-to-date matrix data is on host
4502 
4503 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4504 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read access: bring the device copy up to date (may trigger a host-to-device transfer) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4522 
4523 /*@C
4524   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4525 
4526   Not Collective
4527 
4528   Input Parameters:
4529 + A - a `MATSEQAIJCUSPARSE` matrix
4530 - a - pointer to the device data
4531 
4532   Level: developer
4533 
4534 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4535 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state change needed, just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4545 
4546 /*@C
4547   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4548 
4549   Not Collective
4550 
4551   Input Parameter:
4552 . A - a `MATSEQAIJCUSPARSE` matrix
4553 
4554   Output Parameter:
4555 . a - pointer to the device data
4556 
4557   Level: developer
4558 
4559   Note:
4560   May trigger host-device copies if up-to-date matrix data is on host
4561 
4562 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4563 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access must start from valid device data */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* caller may modify device values: mark the GPU copy authoritative and drop any cached transpose values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4583 /*@C
4584   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4585 
4586   Not Collective
4587 
4588   Input Parameters:
4589 + A - a `MATSEQAIJCUSPARSE` matrix
4590 - a - pointer to the device data
4591 
4592   Level: developer
4593 
4594 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4595 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  /* values may have changed: drop cached diagonal information and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4607 
4608 /*@C
4609   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4610 
4611   Not Collective
4612 
4613   Input Parameter:
4614 . A - a `MATSEQAIJCUSPARSE` matrix
4615 
4616   Output Parameter:
4617 . a - pointer to the device data
4618 
4619   Level: developer
4620 
4621   Note:
4622   Does not trigger host-device copies and flags data validity on the GPU
4623 
4624 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4625 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: intentionally no MatSeqAIJCUSPARSECopyToGPU(), existing values will be overwritten */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* device copy becomes authoritative and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4644 
4645 /*@C
4646   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4647 
4648   Not Collective
4649 
4650   Input Parameters:
4651 + A - a `MATSEQAIJCUSPARSE` matrix
4652 - a - pointer to the device data
4653 
4654   Level: developer
4655 
4656 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4657 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  /* values were (re)written on the device: invalidate diagonal cache and advance the state counter */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4669 
/* Strict-weak ordering on (row, col, value, flag) tuples: lexicographic by (row, col).
   Used as the comparator when merging the COO representations of two matrices. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;          /* primary key: row index */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* tie-break on column index */
  }
};
4678 
/* Unary functor that translates an integer index by a fixed offset
   (e.g. shifting B's column indices by A's column count when concatenating). */
struct Shift {
  int _shift; /* offset added to every input */

  Shift(int shift) { _shift = shift; }
  __host__ __device__ inline int operator()(const int &c)
  {
    return _shift + c;
  }
};
4685 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
/* The merge runs entirely on the device: both CSR structures are expanded to COO, merged in (row, col)
   order with thrust::merge, and converted back to CSR. With MAT_INITIAL_MATRIX the permutation that
   interleaves A's and B's nonzeros into C is cached in Ccusp->coords, so that MAT_REUSE_MATRIX calls
   (same sparsity pattern) only need to scatter the current values of A and B into C. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Build C from scratch: create the Mat and fill in the CUSPARSE bookkeeping by hand
       (no host-side assembly is performed; host i/j arrays are reconstructed at the end). */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    /* standard cuSPARSE descriptor plus device-resident scalar constants used by SpMV */
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* coords records, for each entry of C, where it came from (A first, then B) for later value scatters */
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets into explicit per-entry COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag entries with 1 (from A) / 0 (from B) so the origin of each merged entry is recoverable */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* shift B's column indices on the fly: in C they live to the right of A's columns */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the two COO streams (row, col, value, origin-flag) in (row, col) order */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied above (B must be left unmodified) */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* split the merged positions by origin flag: coords[0..Annz) = positions of A's entries in C,
         coords[Annz..nz) = positions of B's entries in C */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T is [A^T; B^T] stacked by rows: its CSR arrays are simple concatenations of A^T's and B^T's */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* row offsets: A^T's offsets followed by B^T's offsets shifted by A's nnz;
           the two ranges overlap by one entry, hence the advance(rT, -1) */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* reconstruct the host-side AIJ metadata (i, j, ilen, imax, ...) from the device CSR of C */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    /* host values array is allocated but not filled: the authoritative values live on the device */
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: same sparsity pattern as the previous call; scatter A's and B's current
       values into C using the cached interleaving permutation (coords) */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0..Annz) are C-positions of A's entries, coords[Annz..nz) those of B's entries */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C^T values as well: they are just A^T's values followed by B^T's values */
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4967 
/* Copies a subset of the device value array of A into v[]: v[k] = a->a[idx[k]] for k = 0..n-1.
   v may point to either host or device memory; the direction of the final transfer is detected
   at runtime with isCudaMem(). If idx is NULL (or n is 0), the first n values are copied verbatim.
   The gather itself always runs on the device via a thrust permutation iterator. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does the destination live on the device? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set, then gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* destination is host memory: gather into device scratch, copy back below */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the n scalars moved device-to-host, so log that direction
     (previously this wrongly logged PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5003