xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision aa9a5b67dde7169d6e428ea38b8433e2afcb1019)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
25 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
26 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30 
31   typedef enum {
32       CUSPARSE_MV_ALG_DEFAULT = 0,
33       CUSPARSE_COOMV_ALG      = 1,
34       CUSPARSE_CSRMV_ALG1     = 2,
35       CUSPARSE_CSRMV_ALG2     = 3
36   } cusparseSpMVAlg_t;
37 
38   typedef enum {
39       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45       CUSPARSE_SPMM_COO_ALG1    = 1,
46       CUSPARSE_SPMM_COO_ALG2    = 2,
47       CUSPARSE_SPMM_COO_ALG3    = 3,
48       CUSPARSE_SPMM_COO_ALG4    = 5,
49       CUSPARSE_SPMM_CSR_ALG1    = 4,
50       CUSPARSE_SPMM_CSR_ALG2    = 6,
51   } cusparseSpMMAlg_t;
52 
53   typedef enum {
54       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
55       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
56   } cusparseCsr2CscAlg_t;
57   */
58 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61 #endif
62 
63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73 #endif
74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
84 
85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
88 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
89 
90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
92 
93 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96 
/*
  Type-specific implementation behind MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.

  For a sequential matrix there is only one stored matrix, so MAT_CUSPARSE_MULT and
  MAT_CUSPARSE_ALL are equivalent: both simply record the requested storage format.
  Any other operation value is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both ops set the same format on a Seq matrix */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
114 
115 /*@
116   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
117   operation. Only the `MatMult()` operation can use different GPU storage formats
118 
119   Not Collective
120 
121   Input Parameters:
122 + A      - Matrix of type `MATSEQAIJCUSPARSE`
123 . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
124         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
125 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
126 
127   Level: intermediate
128 
129 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
130 @*/
/* Public entry point: validates A, then forwards to the implementation registered under
   "MatCUSPARSESetFormat_C" (if any) via PetscTryMethod(), passing op and format through. */
131 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
132 {
133   PetscFunctionBegin;
134   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
135   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
/*
  Type-specific implementation behind MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE.
  Records the user's CPU-vs-GPU solve preference in the matrix's CUSPARSE context.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu; /* flag is consulted later when a solve is performed */
  PetscFunctionReturn(PETSC_SUCCESS);
}
147 
148 /*@
149   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
150 
151   Input Parameters:
152 + A       - Matrix of type `MATSEQAIJCUSPARSE`
153 - use_cpu - set flag for using the built-in CPU `MatSolve()`
154 
155   Level: intermediate
156 
157   Note:
158   The cuSparse LU solver currently computes the factors with the built-in CPU method
159   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
160   This method to specify if the solve is done on the CPU or GPU (GPU is the default).
161 
162 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
163 @*/
/* Public entry point: validates A, then forwards to the implementation registered under
   "MatCUSPARSESetUseCPUSolve_C" (if any) via PetscTryMethod(), passing use_cpu through. */
164 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
165 {
166   PetscFunctionBegin;
167   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
168   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
/*
  MatSetOption implementation for MATSEQAIJCUSPARSE.

  MAT_FORM_EXPLICIT_TRANSPOSE is intercepted here because the GPU class caches an
  explicit transpose; every other option is delegated to the SeqAIJ (host) handler.
*/
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
187 
/* Process runtime options (-mat_cusparse_*) selecting storage formats and cuSPARSE algorithms
   for a MATSEQAIJCUSPARSE matrix. The PetscCheck()s below guard against cuSPARSE renumbering
   its enums, since PetscOptionsEnum() maps options by position in the string arrays above. */
188 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
189 {
190   MatCUSPARSEStorageFormat format;
191   PetscBool                flg;
192   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
193 
194   PetscFunctionBegin;
195   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
196   if (A->factortype == MAT_FACTOR_NONE) { /* these options apply only to unfactored matrices */
197     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
198     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
199 
200     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
201     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
202     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
203     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
205     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
206     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
207   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
208     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
209   #else
210     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
211   #endif
212     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
213     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
214 
215     PetscCall(
216       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
217     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
218 #endif
219   }
220   PetscOptionsHeadEnd();
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Build (first call) or refresh (later calls) the device-side CSR copy of the host LU factors
   and run cusparseSpSV_analysis() so subsequent MatSolve calls can use cusparseSpSV_solve().
   The host factor stores L (strictly lower, per-row via Ai/Aj) and U (via Adiag, rows stored
   back-to-front) separately; they are merged here into one regular CSR matrix M = L + U. */
225 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
226 {
227   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
228   PetscInt                      m  = A->rmap->n;
229   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
230   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
231   const MatScalar              *Aa = a->a;
232   PetscInt                     *Mi, *Mj, Mnz;
233   PetscScalar                  *Ma;
234 
235   PetscFunctionBegin;
236   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
237     if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
238       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
239       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
240       PetscCall(PetscMalloc1(m + 1, &Mi));
241       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
242       PetscCall(PetscMalloc1(Mnz, &Ma));
243       Mi[0] = 0;
244       for (PetscInt i = 0; i < m; i++) {
245         PetscInt llen = Ai[i + 1] - Ai[i];     // number of L entries in row i (strictly lower)
246         PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of U entries in row i (diagonal included)
247         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
248         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
249         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
250         Mi[i + 1] = Mi[i] + llen + ulen;
251       }
252       // Copy M (L,U) from host to device
253       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
254       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
255       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
256       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
257       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
258 
259       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
260       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
261       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
262       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
263       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
264       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
265       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
266       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
267 
268       // Both L and U descriptors share the same device CSR arrays; fill mode + diag type select the triangle used
269       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
270       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
271       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
272 
273       fillMode = CUSPARSE_FILL_MODE_UPPER;
274       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
275       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
276       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
277       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
278 
279       // Allocate work vectors in SpSv
280       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
281       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
282 
283       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
284       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
285 
286       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
287       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
288       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
289       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
290       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
291       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
292       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
293 
294       // Record for reuse; Mi/Ma stay alive as host mirrors, Mj is no longer needed once on device
295       fs->csrRowPtr_h = Mi;
296       fs->csrVal_h    = Ma;
297       PetscCall(PetscFree(Mj));
298     }
299     // Copy the value
300     Mi  = fs->csrRowPtr_h;
301     Ma  = fs->csrVal_h;
302     Mnz = Mi[m];
303     for (PetscInt i = 0; i < m; i++) {
304       PetscInt llen = Ai[i + 1] - Ai[i];
305       PetscInt ulen = Adiag[i] - Adiag[i + 1];
306       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
307       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (the factor stores its reciprocal)
308       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
309     }
310     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
311 
312     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
313     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
314 
315     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
316 
317     // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
318     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
319   }
320   PetscFunctionReturn(PETSC_SUCCESS);
321 }
321 #else
/* (pre-CUDA 11.4 path) Build or refresh the device-side lower-triangular factor L,
   with explicit unit diagonal entries, from the host ILU factor, and run the csrsv
   solve analysis. On first call the full CSR structure is created; on later calls
   only the values array is refilled and re-uploaded. */
322 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
323 {
324   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
325   PetscInt                           n                  = A->rmap->n;
326   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
327   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
328   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
329   const MatScalar                   *aa = a->a, *v;
330   PetscInt                          *AiLo, *AjLo;
331   PetscInt                           i, nz, nzLower, offset, rowOffset;
332 
333   PetscFunctionBegin;
334   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
335   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
336     try {
337       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
338       /* NOTE(review): ai here indexes the strictly-lower part of the factored storage; row 0 of L is just the unit diagonal -- confirm against the factorization layout */
339       nzLower = n + ai[n] - ai[1];
340       if (!loTriFactor) {
341         PetscScalar *AALo;
342 
343         /* pinned host buffers so the thrust::assign uploads below are fast */
344         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
345 
346         /* Allocate Space for the lower triangular matrix */
347         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
348         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
349 
350         /* Fill the lower triangular matrix: each row gets its strictly-lower entries followed by a unit diagonal */
351         AiLo[0]   = (PetscInt)0;
352         AiLo[n]   = nzLower;
353         AjLo[0]   = (PetscInt)0;
354         AALo[0]   = (MatScalar)1.0;
355         v         = aa;
356         vi        = aj;
357         offset    = 1;
358         rowOffset = 1;
359         for (i = 1; i < n; i++) {
360           nz = ai[i + 1] - ai[i];
361           /* additional 1 for the term on the diagonal */
362           AiLo[i] = rowOffset;
363           rowOffset += nz + 1;
364 
365           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
366           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
367 
368           offset += nz;
369           AjLo[offset] = (PetscInt)i;
370           AALo[offset] = (MatScalar)1.0;
371           offset += 1;
372 
373           v += nz;
374           vi += nz;
375         }
376 
377         /* allocate space for the triangular factor information */
378         PetscCall(PetscNew(&loTriFactor));
379         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
380         /* Create the matrix description */
381         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
382         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
383   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
384         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
385   #else
386         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
387   #endif
388         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
389         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
390 
391         /* set the operation */
392         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
393 
394         /* set the matrix */
395         loTriFactor->csrMat              = new CsrMatrix;
396         loTriFactor->csrMat->num_rows    = n;
397         loTriFactor->csrMat->num_cols    = n;
398         loTriFactor->csrMat->num_entries = nzLower;
399 
400         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
401         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
402 
403         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
404         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
405 
406         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
407         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
408 
409         /* Create the solve analysis information */
410         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
411         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
412   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
413         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
414                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
415         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
416   #endif
417 
418         /* perform the solve analysis */
419         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
420                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
421         PetscCallCUDA(WaitForCUDA());
422         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
423 
424         /* assign the pointer; AALo is kept as the pinned host mirror for later value-only updates */
425         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
426         loTriFactor->AA_h                                          = AALo;
427         PetscCallCUDA(cudaFreeHost(AiLo));
428         PetscCallCUDA(cudaFreeHost(AjLo));
429         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
430       } else { /* update values only */
431         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
432         /* Fill the lower triangular matrix */
433         loTriFactor->AA_h[0] = 1.0;
434         v                    = aa;
435         vi                   = aj;
436         offset               = 1;
437         for (i = 1; i < n; i++) {
438           nz = ai[i + 1] - ai[i];
439           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
440           offset += nz;
441           loTriFactor->AA_h[offset] = 1.0;
442           offset += 1;
443           v += nz;
444         }
445         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
446         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
447       }
448     } catch (char *ex) {
449       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
450     }
451   }
452   PetscFunctionReturn(PETSC_SUCCESS);
453 }
452 
453 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
454 {
455   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
456   PetscInt                           n                  = A->rmap->n;
457   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
458   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
459   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
460   const MatScalar                   *aa = a->a, *v;
461   PetscInt                          *AiUp, *AjUp;
462   PetscInt                           i, nz, nzUpper, offset;
463 
464   PetscFunctionBegin;
465   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
466   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
467     try {
468       /* next, figure out the number of nonzeros in the upper triangular matrix. */
469       nzUpper = adiag[0] - adiag[n];
470       if (!upTriFactor) {
471         PetscScalar *AAUp;
472 
473         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
474 
475         /* Allocate Space for the upper triangular matrix */
476         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
477         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
478 
479         /* Fill the upper triangular matrix */
480         AiUp[0] = (PetscInt)0;
481         AiUp[n] = nzUpper;
482         offset  = nzUpper;
483         for (i = n - 1; i >= 0; i--) {
484           v  = aa + adiag[i + 1] + 1;
485           vi = aj + adiag[i + 1] + 1;
486 
487           /* number of elements NOT on the diagonal */
488           nz = adiag[i] - adiag[i + 1] - 1;
489 
490           /* decrement the offset */
491           offset -= (nz + 1);
492 
493           /* first, set the diagonal elements */
494           AjUp[offset] = (PetscInt)i;
495           AAUp[offset] = (MatScalar)1. / v[nz];
496           AiUp[i]      = AiUp[i + 1] - (nz + 1);
497 
498           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
499           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
500         }
501 
502         /* allocate space for the triangular factor information */
503         PetscCall(PetscNew(&upTriFactor));
504         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
505 
506         /* Create the matrix description */
507         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
508         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
509   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
510         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
511   #else
512         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
513   #endif
514         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
515         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
516 
517         /* set the operation */
518         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
519 
520         /* set the matrix */
521         upTriFactor->csrMat              = new CsrMatrix;
522         upTriFactor->csrMat->num_rows    = n;
523         upTriFactor->csrMat->num_cols    = n;
524         upTriFactor->csrMat->num_entries = nzUpper;
525 
526         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
527         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
528 
529         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
530         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
531 
532         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
533         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
534 
535         /* Create the solve analysis information */
536         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
537         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
538   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
539         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
540                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
541         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
542   #endif
543 
544         /* perform the solve analysis */
545         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
546                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
547 
548         PetscCallCUDA(WaitForCUDA());
549         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
550 
551         /* assign the pointer */
552         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
553         upTriFactor->AA_h                                          = AAUp;
554         PetscCallCUDA(cudaFreeHost(AiUp));
555         PetscCallCUDA(cudaFreeHost(AjUp));
556         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
557       } else {
558         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
559         /* Fill the upper triangular matrix */
560         offset = nzUpper;
561         for (i = n - 1; i >= 0; i--) {
562           v = aa + adiag[i + 1] + 1;
563 
564           /* number of elements NOT on the diagonal */
565           nz = adiag[i] - adiag[i + 1] - 1;
566 
567           /* decrement the offset */
568           offset -= (nz + 1);
569 
570           /* first, set the diagonal elements */
571           upTriFactor->AA_h[offset] = 1. / v[nz];
572           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
573         }
574         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
575         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
576       }
577     } catch (char *ex) {
578       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
579     }
580   }
581   PetscFunctionReturn(PETSC_SUCCESS);
582 }
583 #endif
584 
/* Build (or refresh) the GPU representation of the ILU/LU factors of A, and upload
   the row/column permutations needed by MatSolve when the ordering is not the identity.
   The permutation arrays are uploaded at most once (guarded by the null checks). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *tf   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  const PetscInt                nrow = A->rmap->n;
  PetscBool                     identity;

  PetscFunctionBegin;
  PetscCheck(tf, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!tf->workVector) tf->workVector = new THRUSTARRAY(nrow);
#endif

  tf->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* Row permutation (used when solving the lower triangular system) */
  PetscCall(ISIdentity(aij->row, &identity));
  if (!identity && !tf->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(aij->row, &idx));
    tf->rpermIndices = new THRUSTINTARRAY(nrow);
    tf->rpermIndices->assign(idx, idx + nrow);
    PetscCall(ISRestoreIndices(aij->row, &idx));
    PetscCall(PetscLogCpuToGpu(nrow * sizeof(PetscInt)));
  }

  /* Inverse column permutation (used when solving the upper triangular system) */
  PetscCall(ISIdentity(aij->icol, &identity));
  if (!identity && !tf->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(aij->icol, &idx));
    tf->cpermIndices = new THRUSTINTARRAY(nrow);
    tf->cpermIndices->assign(idx, idx + nrow);
    PetscCall(ISRestoreIndices(aij->icol, &idx));
    PetscCall(PetscLogCpuToGpu(nrow * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
631 
632 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build the device (cuSPARSE generic-API) representation of the IC(0)/Cholesky factors of A
// and run the numeric SpSV analysis for both the U and U^T solves.
// Only the upper-triangular factor U (stored with a unit diagonal) and the diagonal D are kept
// on the GPU; the lower-triangular solve reuses the same matrix with CUSPARSE_OPERATION_TRANSPOSE.
// Structure (row pointers, column indices, descriptors, buffers) is created once, on the first
// call (guarded by fs->csrRowPtr); values are re-uploaded on every call where A's factors are on CPU.
// NOTE(review): "Cheolesky" is a typo for "Cholesky"; the name is kept unchanged to match its callers.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; values (csrVal) are filled below, outside this one-time block
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse; Ma and D stay alive across calls as the host staging arrays
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // off-diagonal entries of U are negated relative to the host factor
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
717 
// Solve Ut D U x = b, where U (unit diagonal, upper triangular) and D (inverted diagonal)
// were staged on the GPU by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky().
// The Ut solve reuses spMatDescr_U with CUSPARSE_OPERATION_TRANSPOSE; the D solve is a
// pointwise multiply done with thrust since D already holds the inverse diagonal.
// Row/column permutations (fs->rpermIndices / fs->cpermIndices), when present, are applied
// on entry and exit via permutation iterators through the intermediate buffer fs->X.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: alias the dense-vector descriptor directly onto b's device array
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); // two triangular solves (~2 flops/nz each) minus the unit diagonal work
  PetscFunctionReturn(PETSC_SUCCESS);
}
773 #else
/* Build (first call) or refresh (subsequent calls) the legacy-cusparse (csrsv) triangular
   factors for an IC(0)/Cholesky factorization (pre CUDA 11.4 path).
   The host factor stores only the upper triangle U with inverted diagonal; this routine
   creates two device copies sharing the same sparsity pattern:
     - upTriFactor: U with unit diagonal, solved with NON_TRANSPOSE,
     - loTriFactor: scaled values representing L^T, solved with TRANSPOSE (hence its
       fill mode is also UPPER even though it plays the role of the lower factor).
   Structure and csrsv analysis are done once (when both factor structs are null);
   afterwards only the numerical values are recomputed and re-uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is cast to both Mat_SeqAIJ (above) and Mat_SeqSBAIJ (below) and the
     i/j/a members of the latter are read — presumably the two structs are layout-compatible in
     these leading CSR fields for a factored matrix; confirm before touching these casts. */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* host staging buffers for the values of both factors (pinned for fast upload) */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz]; /* v[nz] holds the (inverted-on-host) diagonal; see MatCholeskyFactorNumeric_SeqAIJ */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* negate U's off-diagonal entries, and scale them by the diagonal for the L^T copy */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* UPPER fill mode is intentional: the lower factor is stored as U's pattern and solved transposed */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structure already on the GPU: only recompute and re-upload the values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
970 #endif
971 
/* Build (or refresh) the GPU representation of the IC(0)/Cholesky factor of A, and upload
   the permutation and its inverse needed by MatSolve when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* only the upper triangle is stored: count strictly-upper entries twice plus the diagonal once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices.
     Guard on rpermIndices (allocated together with cpermIndices below) so a refactorization
     does not leak the previously allocated device arrays — this mirrors the guards used in
     MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(); the permutation is fixed by the symbolic phase. */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1013 
/* Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: factor on the host with the
   SeqAIJ kernel, then install the GPU solve callbacks and push the factors to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* ensure A's latest values are on the host before the CPU factorization */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* pick the MatSolve variant according to whether the ordering is the identity */
  {
    Mat_SeqAIJ *bdata = (Mat_SeqAIJ *)B->data;
    PetscBool   natural;

    PetscCall(ISIdentity(bdata->row, &natural));
    if (natural) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1046 
1047 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1048 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1049 {
1050   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1054   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1055   cusparseIndexBase_t                indexBase;
1056   cusparseMatrixType_t               matrixType;
1057   cusparseFillMode_t                 fillMode;
1058   cusparseDiagType_t                 diagType;
1059 
1060   PetscFunctionBegin;
1061   /* allocate space for the transpose of the lower triangular factor */
1062   PetscCall(PetscNew(&loTriFactorT));
1063   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1064 
1065   /* set the matrix descriptors of the lower triangular factor */
1066   matrixType = cusparseGetMatType(loTriFactor->descr);
1067   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1068   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1069   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1070 
1071   /* Create the matrix description */
1072   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1073   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1074   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1075   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1076   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1077 
1078   /* set the operation */
1079   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1080 
1081   /* allocate GPU space for the CSC of the lower triangular factor*/
1082   loTriFactorT->csrMat                 = new CsrMatrix;
1083   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1084   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1085   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1086   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1087   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1088   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1089 
1090   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1091   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1092   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1093                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1094                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1095   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1096   #endif
1097 
1098   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1099   {
1100     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1101     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1102                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1103   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1104                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1105   #else
1106                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1107   #endif
1108     PetscCallCUSPARSE(stat);
1109   }
1110 
1111   PetscCallCUDA(WaitForCUDA());
1112   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1113 
1114   /* Create the solve analysis information */
1115   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1116   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1117   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1118   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1119                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1120   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1121   #endif
1122 
1123   /* perform the solve analysis */
1124   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1125                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1126 
1127   PetscCallCUDA(WaitForCUDA());
1128   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1129 
1130   /* assign the pointer */
1131   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1132 
1133   /*********************************************/
1134   /* Now the Transpose of the Upper Tri Factor */
1135   /*********************************************/
1136 
1137   /* allocate space for the transpose of the upper triangular factor */
1138   PetscCall(PetscNew(&upTriFactorT));
1139   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1140 
1141   /* set the matrix descriptors of the upper triangular factor */
1142   matrixType = cusparseGetMatType(upTriFactor->descr);
1143   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1144   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1145   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1146 
1147   /* Create the matrix description */
1148   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1149   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1150   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1151   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1152   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1153 
1154   /* set the operation */
1155   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1156 
1157   /* allocate GPU space for the CSC of the upper triangular factor*/
1158   upTriFactorT->csrMat                 = new CsrMatrix;
1159   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1160   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1161   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1162   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1163   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1164   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1165 
1166   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1167   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1168   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1169                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1170                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1171   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1172   #endif
1173 
1174   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1175   {
1176     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1177     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1178                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1179   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1180                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1181   #else
1182                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1183   #endif
1184     PetscCallCUSPARSE(stat);
1185   }
1186 
1187   PetscCallCUDA(WaitForCUDA());
1188   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1189 
1190   /* Create the solve analysis information */
1191   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1192   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1193   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1194   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1195                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1196   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1197   #endif
1198 
1199   /* perform the solve analysis */
1200   /* christ, would it have killed you to put this stuff in a function????????? */
1201   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1202                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1203 
1204   PetscCallCUDA(WaitForCUDA());
1205   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1206 
1207   /* assign the pointer */
1208   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 #endif
1212 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform() to convert permutation indices that were carried
   through csr2csc in a scalar array back into integer indices. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar val)
  {
    return static_cast<PetscInt>(PetscRealPart(val));
  }
};
1216 
/* Build (or refresh) the explicit transpose of a SeqAIJCUSPARSE matrix on the GPU,
   stored in cusparsestruct->matTranspose. For the CSR format the sparsity pattern of
   the transpose is computed only once (via csr2csc on an index-valued array); later
   calls only permute the current values into place using the cached csr2csc_i map. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* nothing to do: the transpose is up to date */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident scalars used by the SpMV wrappers) */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the CSR of the transpose has A's dimensions swapped and the same nonzero count */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the host row offsets a->i (as 32-bit ints) for the csr2csc call below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* legacy (pre CUDA-11) path: HYB -> CSR -> CSC -> HYB round trip on temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps num_rows = A->rmap->n and num_cols = A->cmap->n, i.e. the
         dimensions are NOT swapped for the transpose; this looks suspect for rectangular A —
         confirm (this legacy path may only ever be exercised with square matrices) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* First call: compute the CSR -> CSC entry permutation. Fill a scalar array with
         0,1,2,... and transpose it numerically; the transposed values, converted back to
         integers, give for each entry of the transpose the index of the source entry in A. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather A's current values into transpose order using the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1409 
1410 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Solve A x = b with the cuSPARSE SpSV LU factors held in A->spptr (CUDA >= 11.4 path).
// The factor structure carries L/U sparse descriptors (spMatDescr_L/U), work vectors X/Y,
// and optional row/column permutation index arrays (rpermIndices/cpermIndices).
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *a       = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             opA     = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              nrows   = A->rmap->n;
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  xdev = thrust::device_pointer_cast(x_d);
  bdev = thrust::device_pointer_cast(b_d);

  // Apply the row permutation (if any) to b, staging the result in factors->X
  if (factors->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)b_d));
  }

  // Forward solve: L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, opA, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_L));

  // Backward solve: U X = Y; target the staging buffer when a column permutation must follow
  if (factors->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, x_d));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, opA, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  // Apply the column permutation (if any) and deposit the result in x
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), xdev));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1462 
/* Solve A^T x = b with the cuSPARSE SpSV LU factors (CUDA >= 11.4 path), reusing the
   non-transposed L/U sparse descriptors with opA = CUSPARSE_OPERATION_TRANSPOSE.
   The transpose-solve descriptors (spsvDescr_Lt/Ut), their buffers, and the SpSV
   analysis are created lazily on the first call and cached in the factor structure. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    /* create descriptors and allocate external buffers for the transposed solves */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    /* (re)run the SpSV analysis phase; invalidated elsewhere when the factor values change */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X (transpose solve order is U^T first, then L^T)
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1533 #else
1534 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b with the legacy csrsv API (CUDA < 11.4 path): apply the row
   permutation, solve with the transposed factors (U^T first, then L^T), and finish
   with the column permutation. The transposed factor structures are built lazily on
   the first call via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().

   Fix over previous revision: the three thrust::copy() calls are now wrapped in
   PetscCallThrust(), consistent with the rest of this file, so thrust exceptions are
   converted to PETSc errors instead of escaping this PETSC_SUCCESS-returning function. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, writing into x */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve with U^T (x -> tempGPU) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with L^T (tempGPU -> x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1586 
/* Solve A^T x = b for a factored matrix in natural (identity) ordering.
   Uses the lazily-built transposed triangular factors: first U^T (forward),
   then L^T (backward), staging the intermediate result in the work vector. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Build the transposed factors on first use */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Map the PETSc vectors to raw device arrays */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Stage 1: solve with U^T, result goes into the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(),
                                         upT->csrMat->row_offsets->data().get(), upT->csrMat->column_indices->data().get(), upT->solveInfo, b_d, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Stage 2: solve with L^T, producing the final solution in x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(),
                                         loT->csrMat->row_offsets->data().get(), loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x_d, loT->solvePolicy, loT->solveBuffer));

  /* Hand the device arrays back to the vectors */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1624 
/* Solve A x = b with row/column permutations from the (re)ordered factorization:
   permute b by rperm into the work vector, solve L then U, then scatter back
   through cperm into x. The thrust permutation-iterator end is determined by
   the index-iterator end, so passing the unshifted base pointer is correct. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> bThrust;
  thrust::device_ptr<PetscScalar>       xThrust;
  Mat_SeqAIJCUSPARSETriFactors         *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                          *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map the PETSc vectors to device memory */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));
  xThrust = thrust::device_pointer_cast(x_d);
  bThrust = thrust::device_pointer_cast(b_d);

  PetscCall(PetscLogGpuTimeBegin());
  /* Apply the row permutation: work = b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bThrust, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bThrust, factors->rpermIndices->end()), work->begin());

  /* Forward solve with L, writing into x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(),
                                         lo->csrMat->row_offsets->data().get(), lo->csrMat->column_indices->data().get(), lo->solveInfo, work->data().get(), x_d, lo->solvePolicy, lo->solveBuffer));

  /* Backward solve with U, writing into the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(),
                                         up->csrMat->row_offsets->data().get(), up->csrMat->column_indices->data().get(), up->solveInfo, x_d, work->data().get(), up->solvePolicy, up->solveBuffer));

  /* Apply the column permutation: x = work(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()), xThrust);

  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1664 
/* Solve A x = b when the factorization used natural ordering: no permutations,
   just a forward solve with L into the work vector and a backward solve with U
   into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map the PETSc vectors to raw device arrays */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Forward solve: L * work = b */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(),
                                         lo->csrMat->row_offsets->data().get(), lo->csrMat->column_indices->data().get(), lo->solveInfo, b_d, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Backward solve: U * x = work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(),
                                         up->csrMat->row_offsets->data().get(), up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x_d, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1694 #endif
1695 
1696 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric phase of the ILU(0) factorization on the GPU (CUDA >= 11.4 path).
   The symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0) already allocated
   fact's device CSR arrays, created the L/U descriptors, and ran csrilu02 analysis;
   here we refresh fact's values from A, factor in place with cusparseXcsrilu02(),
   and redo the (numeric) SpSV analysis for the triangular solves. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  /* fact shares A's nonzero pattern (ILU0 has no fill), so a flat device-to-device
     copy of the value array is sufficient */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsrilu02_zeroPivot() is blocking; it synchronizes the device before reporting */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimated in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1750 
/* Symbolic phase of the GPU ILU(0) factorization (CUDA >= 11.4 path).
   Since ILU0 introduces no fill, fact reuses A's sparsity pattern: we copy A's
   (32-bit) row pointers and column indices to device arrays owned by fact,
   create the cuSPARSE descriptors for M (the in-place ILU storage) and the
   L/U triangular views, size and allocate all work buffers, run the csrilu02
   structural analysis, estimate the numeric-factorization FLOPs, and install
   MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 as the numeric-phase routine.
   The IS (row/column permutation) arguments are ignored: this path only
   supports natural ordering. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU0 has zero fill-in by construction */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are views of the same in-place ILU storage (csrVal), distinguished
     only by fill mode and diagonal type: L is unit-lower, U is non-unit-upper */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  /* X, Y back the dense-vector descriptors used by SpSV; the solve routines
     later repoint the descriptors at user data with cusparseDnVecSetValues() */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        /* NOTE(review): the exact nzLeft computed above (Adiag[i]-Ai[i]) is immediately
           overwritten by the (nzRow-1)/2 half-row estimate below — presumably an intentional
           averaging for the FLOP estimate, but worth confirming */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* logged by the numeric phase via PetscLogGpuFlops() */
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1907 
/* Solve A x = b with the IC(0) factorization A ~ L*L^T: a forward solve with L
   into the internal work vector Y, then a backward solve with L^T into x.
   The dense-vector descriptors are repointed at the caller's data via
   cusparseDnVecSetValues() before each solve. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *b_d;
  PetscScalar                  *x_d;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* L Y = X */
                                       factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* Lt X = Y */
                                       factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1938 
/* Numeric phase of the GPU IC(0) (incomplete Cholesky, zero fill) factorization.
   The symbolic phase already allocated fact's device CSR arrays and descriptors
   and ran csric02 analysis; here we refresh fact's values from A, factor in place
   with cusparseXcsric02(), and redo the (numeric) SpSV analysis for the L and
   L^T solves used by MatSolve_SeqAIJCUSPARSE_ICC0(). */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  /* IC0 has no fill, so fact shares A's nonzero pattern and a flat value copy suffices */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* csric02 errors out when m=0 */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsric02_zeroPivot() is blocking; it synchronizes the device before reporting */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (it reads the factored values), so it must follow cusparseXcsric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  /* A = L L^T is symmetric, so the transpose solve is the same as the forward solve */
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimated in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1991 
1992 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1993 {
1994   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1995   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1996   PetscInt                      m, nz;
1997 
1998   PetscFunctionBegin;
1999   if (PetscDefined(USE_DEBUG)) {
2000     PetscInt  i;
2001     PetscBool flg, missing;
2002 
2003     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2004     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2005     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2006     PetscCall(MatMissingDiagonal(A, &missing, &i));
2007     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2008   }
2009 
2010   /* Free the old stale stuff */
2011   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2012 
2013   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2014      but they will not be used. Allocate them just for easy debugging.
2015    */
2016   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2017 
2018   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2019   fact->factortype             = MAT_FACTOR_ICC;
2020   fact->info.factor_mallocs    = 0;
2021   fact->info.fill_ratio_given  = info->fill;
2022   fact->info.fill_ratio_needed = 1.0;
2023 
2024   aij->row = NULL;
2025   aij->col = NULL;
2026 
2027   /* ====================================================================== */
2028   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2029   /* We'll do in-place factorization on fact                                */
2030   /* ====================================================================== */
2031   const int *Ai, *Aj;
2032 
2033   m  = fact->rmap->n;
2034   nz = aij->nz;
2035 
2036   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2037   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2038   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2039   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2040   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2041   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2042 
2043   /* ====================================================================== */
2044   /* Create mat descriptors for M, L                                        */
2045   /* ====================================================================== */
2046   cusparseFillMode_t fillMode;
2047   cusparseDiagType_t diagType;
2048 
2049   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2050   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2051   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2052 
2053   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2054     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2055     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2056     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2057     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2058   */
2059   fillMode = CUSPARSE_FILL_MODE_LOWER;
2060   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2061   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2062   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2063   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2064 
2065   /* ========================================================================= */
2066   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2067   /* ========================================================================= */
2068   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2069   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2070 
2071   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2072   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2073 
2074   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2075   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2079 
2080   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2081   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2082 
2083   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2084      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2085    */
2086   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2087     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2088     fs->spsvBuffer_L = fs->factBuffer_M;
2089     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2090   } else {
2091     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2092     fs->spsvBuffer_Lt = fs->factBuffer_M;
2093     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2094   }
2095 
2096   /* ========================================================================== */
2097   /* Perform analysis of ic0 on M                                               */
2098   /* The lower triangular part of M has the same sparsity pattern as L          */
2099   /* ========================================================================== */
2100   int              structural_zero;
2101   cusparseStatus_t status;
2102 
2103   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2104   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2105   if (PetscDefined(USE_DEBUG)) {
2106     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2107     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2108     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2109   }
2110 
2111   /* Estimate FLOPs of the numeric factorization */
2112   {
2113     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2114     PetscInt      *Ai, nzRow, nzLeft;
2115     PetscLogDouble flops = 0.0;
2116 
2117     Ai = Aseq->i;
2118     for (PetscInt i = 0; i < m; i++) {
2119       nzRow = Ai[i + 1] - Ai[i];
2120       if (nzRow > 1) {
2121         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2122           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2123         */
2124         nzLeft = (nzRow - 1) / 2;
2125         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2126       }
2127     }
2128     fs->numericFactFlops = flops;
2129   }
2130   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2131   PetscFunctionReturn(PETSC_SUCCESS);
2132 }
2133 #endif
2134 
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // For the unfactored matrix A, spptr is a Mat_SeqAIJCUSPARSE (which carries the
  // use_cpu_solve flag); for the factored matrix B, spptr is a Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* Numeric LU factorization is performed on the host; first make sure the host copy of A is current */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Pick the MatSolve variant depending on whether the row/column orderings are trivial */
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    PetscBool   rowid, colid;

    PetscCall(ISIdentity(bseq->row, &rowid));
    PetscCall(ISIdentity(bseq->col, &colid));
    if (rowid && colid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the device for the GPU solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2173 
/* Symbolic LU factorization: discard any stale device-side triangular factors,
   run the host symbolic phase, and route the numeric phase through the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2184 
/* Symbolic ILU factorization. For CUDA >= 11.4, ILU(0) with identity row/column
   permutations is dispatched to a fully device-side path; otherwise the host
   symbolic phase is used and the numeric phase goes through the CUSPARSE version.
   Note the `} else` / `#endif` / `{` arrangement: the braced block below serves as
   the else-branch when the fast path is compiled in, and as unconditional code otherwise. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  /* only query the orderings when the user has not forced factorization on the host */
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  /* ILU(0) with natural ordering can run entirely on the GPU */
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* general path: reset stale device factors, do the symbolic phase on the host */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2207 
/* Symbolic ICC factorization. For CUDA >= 11.4, ICC(0) with an identity permutation
   is dispatched to the device-side path (MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0);
   otherwise the host symbolic phase is used. The `} else` / `#endif` / `{` layout makes
   the braced block the else-branch when the fast path is compiled in, and unconditional otherwise. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  /* only query the permutation when the user has not forced factorization on the host */
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* general path: reset stale device factors, do the symbolic phase on the host */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2227 
/* Symbolic Cholesky factorization: reset any stale device-side factors, run the host
   symbolic phase, and route the numeric phase through the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2238 
/* Report the solver package (MATSOLVERCUSPARSE) that backs this factored matrix. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2245 
2246 /*MC
2247   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2248   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2251   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2252   algorithms are not recommended. This class does NOT support direct solver operations.
2253 
2254   Level: beginner
2255 
2256 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2257           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2258 M*/
2259 
/* Factory routine registered for MATSEQAIJCUSPARSE: create the (empty) factored matrix B
   for the requested factor type and install the symbolic-factorization callbacks.
   When A is bound to the CPU, the plain SeqAIJ symbolic routines are installed instead. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2300 
/* Copy the matrix numerical values from the device back to the host CSR array a->a.
   Handles both unfactored matrices (values in cusp->mat) and, for CUDA >= 11.4,
   factored matrices whose values live in the tri-factors struct (fs->csrVal).
   Only the values are copied; the sparsity pattern is assumed unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; /* valid view of spptr when A is a factored matrix */
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2330 
/* Give read/write access to the host value array, syncing from the GPU first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* ensure host values are current before handing them out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2338 
/* End read/write access: the caller may have modified host values, so the CPU copy
   becomes the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2346 
/* Give read-only access to the host value array, syncing from the GPU first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  /* ensure host values are current before handing them out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2354 
/* End read-only access: nothing was modified, so the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2361 
/* Give write-only access to the host value array. No device-to-host sync is needed
   because the current contents will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2368 
/* End write-only access: host values were (re)written, so mark the CPU copy current. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2376 
/* Expose the device-resident CSR arrays (row offsets, column indices, values) of an
   unfactored SeqAIJCUSPARSE matrix together with the memory type they live in.

   Any of i, j, a, mtype may be NULL when the caller does not need that output.
   Raises PETSC_ERR_SUP for 64-bit-index builds: the device index arrays are stored as
   32-bit ints (THRUSTINTARRAY32) and cannot be aliased as PetscInt pointers.

   Fix: corrected the grammatically broken user-facing error message
   ("does not supported" -> "does not support") in both index branches. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2407 
/* Ensure the matrix has an up-to-date copy on the GPU, (re)building the cuSPARSE data
   structures when the nonzero pattern or the storage format changed.

   Fast path: when the nonzero state matches and the format is CSR, only the values are
   re-uploaded. Otherwise the Mat_SeqAIJCUSPARSEMultStruct is rebuilt from the host CSR
   (or compressed-row) arrays. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* will host and device hold the same values afterwards? */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so a cached transpose is stale (PETSC_FALSE: pattern still valid) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* full rebuild: destroy old device structures first */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* choose between the compressed-row representation (nonempty rows only) and plain CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: only the pattern goes to the device */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (device pointer mode is set below) */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* legacy pre-CUDA-11 path: build a temporary CSR, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2557 
/* Thrust functor for zipped (src, dst) iterators: accumulates element 0 into element 1. */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2565 
/* Thrust functor for zipped (src, dst) iterators: copies element 0 into element 1. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2573 
/* Thrust functor for zipped iterators, reversed direction: copies element 1 into element 0. */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2581 
/* Per-product scratch data attached to a Mat_Product for cuSPARSE matrix-matrix
   operations; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* presumably: is the product matrix C dense? — verify against callers */
  PetscScalar   *Bt;       /* device buffer; name suggests an explicit transpose of B — TODO confirm */
  Mat            X;        /* intermediate dense matrix used for RARt/PtAP-style products — see MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count for logging */
  CsrMatrix     *Bcsr;     /* CSR copy of B (freed with delete in the destructor) */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B (recreated when Blda changes) */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C or X (recreated when Clda changes) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  void *dBuffer4; /* extra device work buffers — NOTE(review): appear SpGEMM-related, confirm against cuSPARSE 11.4+ API usage */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* main device work buffer for the matmat operation */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor for sparse*sparse (SpGEMM) products */
#endif
};
2606 
/* Destructor for the MatMatCusparse product data: releases all device buffers and
   cuSPARSE descriptors, then frees the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  /* cudaFree(NULL) and delete of a null pointer are no-ops, so Bt/Bcsr need no guard */
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2630 
2631 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2632 
2633 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2634 {
2635   Mat_Product                  *product = C->product;
2636   Mat                           A, B;
2637   PetscInt                      m, n, blda, clda;
2638   PetscBool                     flg, biscuda;
2639   Mat_SeqAIJCUSPARSE           *cusp;
2640   cusparseStatus_t              stat;
2641   cusparseOperation_t           opA;
2642   const PetscScalar            *barray;
2643   PetscScalar                  *carray;
2644   MatMatCusparse               *mmdata;
2645   Mat_SeqAIJCUSPARSEMultStruct *mat;
2646   CsrMatrix                    *csrmat;
2647 
2648   PetscFunctionBegin;
2649   MatCheckProduct(C, 1);
2650   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2651   mmdata = (MatMatCusparse *)product->data;
2652   A      = product->A;
2653   B      = product->B;
2654   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2655   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2656   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2657      Instead of silently accepting the wrong answer, I prefer to raise the error */
2658   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2659   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2660   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2661   switch (product->type) {
2662   case MATPRODUCT_AB:
2663   case MATPRODUCT_PtAP:
2664     mat = cusp->mat;
2665     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2666     m   = A->rmap->n;
2667     n   = B->cmap->n;
2668     break;
2669   case MATPRODUCT_AtB:
2670     if (!A->form_explicit_transpose) {
2671       mat = cusp->mat;
2672       opA = CUSPARSE_OPERATION_TRANSPOSE;
2673     } else {
2674       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2675       mat = cusp->matTranspose;
2676       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2677     }
2678     m = A->cmap->n;
2679     n = B->cmap->n;
2680     break;
2681   case MATPRODUCT_ABt:
2682   case MATPRODUCT_RARt:
2683     mat = cusp->mat;
2684     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2685     m   = A->rmap->n;
2686     n   = B->rmap->n;
2687     break;
2688   default:
2689     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2690   }
2691   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2692   csrmat = (CsrMatrix *)mat->mat;
2693   /* if the user passed a CPU matrix, copy the data to the GPU */
2694   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2695   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2696   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2697 
2698   PetscCall(MatDenseGetLDA(B, &blda));
2699   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2700     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2701     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2702   } else {
2703     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2704     PetscCall(MatDenseGetLDA(C, &clda));
2705   }
2706 
2707   PetscCall(PetscLogGpuTimeBegin());
2708 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2709   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2710   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2711   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2712   #else
2713   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2714   #endif
2715 
2716   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2717   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2718     size_t mmBufferSize;
2719     if (mmdata->initialized && mmdata->Blda != blda) {
2720       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2721       mmdata->matBDescr = NULL;
2722     }
2723     if (!mmdata->matBDescr) {
2724       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2725       mmdata->Blda = blda;
2726     }
2727 
2728     if (mmdata->initialized && mmdata->Clda != clda) {
2729       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2730       mmdata->matCDescr = NULL;
2731     }
2732     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2733       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2734       mmdata->Clda = clda;
2735     }
2736 
2737   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2738     if (matADescr) {
2739       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2740       matADescr = NULL;
2741     }
2742   #endif
2743 
2744     if (!matADescr) {
2745       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2746                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2747       PetscCallCUSPARSE(stat);
2748     }
2749 
2750     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2751 
2752     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2753       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2754       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2755       mmdata->mmBufferSize = mmBufferSize;
2756     }
2757 
2758   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
2759     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2760   #endif
2761 
2762     mmdata->initialized = PETSC_TRUE;
2763   } else {
2764     /* to be safe, always update pointers of the mats */
2765     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2766     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2767     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2768   }
2769 
2770   /* do cusparseSpMM, which supports transpose on B */
2771   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2772 #else
2773   PetscInt k;
2774   /* cusparseXcsrmm does not support transpose on B */
2775   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2776     cublasHandle_t cublasv2handle;
2777     cublasStatus_t cerr;
2778 
2779     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2780     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2781     PetscCallCUBLAS(cerr);
2782     blda = B->cmap->n;
2783     k    = B->cmap->n;
2784   } else {
2785     k = B->rmap->n;
2786   }
2787 
2788   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2789   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2790   PetscCallCUSPARSE(stat);
2791 #endif
2792   PetscCall(PetscLogGpuTimeEnd());
2793   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2794   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2795   if (product->type == MATPRODUCT_RARt) {
2796     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2797     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2798   } else if (product->type == MATPRODUCT_PtAP) {
2799     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2800     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2801   } else {
2802     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2803   }
2804   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2805   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2806   PetscFunctionReturn(PETSC_SUCCESS);
2807 }
2808 
/*
  Symbolic phase for products C = op(A)*op(B) with A of type SEQAIJCUSPARSE and B dense.

  Determines the dimensions (and block sizes, when meaningful) of C from the product
  type, switches C to MATSEQDENSECUDA so the numeric phase runs on the GPU, and
  allocates the MatMatCusparse scratch data (transpose buffer Bt for pre-CUDA-11
  builds, and the intermediate dense matrix X for RARt/PtAP) consumed by
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA, which is installed as the
  productnumeric callback at the end.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* size C according to the product type; propagate block sizes from the factors when available */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  /* remember whether the user asked for a CPU dense C, so numeric can convert back */
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2888 
/*
  Numeric phase for sparse-sparse products C = op(A)*op(B) with A, B, C all of type
  SEQAIJCUSPARSE and stored in CSR format.

  Uses the structures set up by the symbolic phase (stored in C->product->data as a
  MatMatCusparse): on CUDA >= 11.4 it calls cusparseSpGEMMreuse_compute, on CUDA 11.x
  it calls cusparseSpGEMM_compute + cusparseSpGEMM_copy, and on older toolkits it
  falls back to cusparse_csr_spgemm. Transposes are never passed to cuSPARSE (opA/opB
  fixed to non-transpose); AtB/ABt instead use the explicitly formed transposes
  selected below. Ends by marking C assembled and GPU-resident.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty product: skip the GPU computation, only run the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* mirror the symmetry-based product-type replacements made by the symbolic phase,
     so the mult structs selected below match the ones symbolic prepared */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* select the mult structs: AtB/ABt use the explicit transposes formed in symbolic,
     since opA/opB stay non-transpose for cuSPARSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure was computed in symbolic with cusparseSpGEMMreuse_*; only the values are recomputed here */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* legacy SpGEMM API: recompute with the buffers kept in mmdata, then copy results into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 fallback: one-shot csrgemm on the raw CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flops count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3009 
3010 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3011 {
3012   Mat_Product                  *product = C->product;
3013   Mat                           A, B;
3014   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3015   Mat_SeqAIJ                   *a, *b, *c;
3016   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3017   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3018   PetscInt                      i, j, m, n, k;
3019   PetscBool                     flg;
3020   cusparseStatus_t              stat;
3021   MatProductType                ptype;
3022   MatMatCusparse               *mmdata;
3023   PetscLogDouble                flops;
3024   PetscBool                     biscompressed, ciscompressed;
3025 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3026   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3027   cusparseSpMatDescr_t BmatSpDescr;
3028 #else
3029   int cnz;
3030 #endif
3031   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3032 
3033   PetscFunctionBegin;
3034   MatCheckProduct(C, 1);
3035   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3036   A = product->A;
3037   B = product->B;
3038   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3039   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3040   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3041   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3042   a = (Mat_SeqAIJ *)A->data;
3043   b = (Mat_SeqAIJ *)B->data;
3044   /* product data */
3045   PetscCall(PetscNew(&mmdata));
3046   C->product->data    = mmdata;
3047   C->product->destroy = MatDestroy_MatMatCusparse;
3048 
3049   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3050   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3051   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3052   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3053   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3054   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3055 
3056   ptype = product->type;
3057   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3058     ptype                                          = MATPRODUCT_AB;
3059     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3060   }
3061   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3062     ptype                                          = MATPRODUCT_AB;
3063     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3064   }
3065   biscompressed = PETSC_FALSE;
3066   ciscompressed = PETSC_FALSE;
3067   switch (ptype) {
3068   case MATPRODUCT_AB:
3069     m    = A->rmap->n;
3070     n    = B->cmap->n;
3071     k    = A->cmap->n;
3072     Amat = Acusp->mat;
3073     Bmat = Bcusp->mat;
3074     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3075     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3076     break;
3077   case MATPRODUCT_AtB:
3078     m = A->cmap->n;
3079     n = B->cmap->n;
3080     k = A->rmap->n;
3081     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3082     Amat = Acusp->matTranspose;
3083     Bmat = Bcusp->mat;
3084     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3085     break;
3086   case MATPRODUCT_ABt:
3087     m = A->rmap->n;
3088     n = B->rmap->n;
3089     k = A->cmap->n;
3090     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3091     Amat = Acusp->mat;
3092     Bmat = Bcusp->matTranspose;
3093     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3094     break;
3095   default:
3096     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3097   }
3098 
3099   /* create cusparse matrix */
3100   PetscCall(MatSetSizes(C, m, n, m, n));
3101   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3102   c     = (Mat_SeqAIJ *)C->data;
3103   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3104   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3105   Ccsr  = new CsrMatrix;
3106 
3107   c->compressedrow.use = ciscompressed;
3108   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3109     c->compressedrow.nrows = a->compressedrow.nrows;
3110     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3111     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3112     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3113     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3114     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3115   } else {
3116     c->compressedrow.nrows  = 0;
3117     c->compressedrow.i      = NULL;
3118     c->compressedrow.rindex = NULL;
3119     Ccusp->workVector       = NULL;
3120     Cmat->cprowIndices      = NULL;
3121   }
3122   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3123   Ccusp->mat        = Cmat;
3124   Ccusp->mat->mat   = Ccsr;
3125   Ccsr->num_rows    = Ccusp->nrows;
3126   Ccsr->num_cols    = n;
3127   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3128   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3129   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3130   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3131   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3132   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3133   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3134   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3135   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3136   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3137   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3138     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3139     c->nz                = 0;
3140     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3141     Ccsr->values         = new THRUSTARRAY(c->nz);
3142     goto finalizesym;
3143   }
3144 
3145   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3146   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3147   Acsr = (CsrMatrix *)Amat->mat;
3148   if (!biscompressed) {
3149     Bcsr = (CsrMatrix *)Bmat->mat;
3150 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3151     BmatSpDescr = Bmat->matDescr;
3152 #endif
3153   } else { /* we need to use row offsets for the full matrix */
3154     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3155     Bcsr                 = new CsrMatrix;
3156     Bcsr->num_rows       = B->rmap->n;
3157     Bcsr->num_cols       = cBcsr->num_cols;
3158     Bcsr->num_entries    = cBcsr->num_entries;
3159     Bcsr->column_indices = cBcsr->column_indices;
3160     Bcsr->values         = cBcsr->values;
3161     if (!Bcusp->rowoffsets_gpu) {
3162       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3163       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3164       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3165     }
3166     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3167     mmdata->Bcsr      = Bcsr;
3168 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3169     if (Bcsr->num_rows && Bcsr->num_cols) {
3170       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3171       PetscCallCUSPARSE(stat);
3172     }
3173     BmatSpDescr = mmdata->matSpBDescr;
3174 #endif
3175   }
3176   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3177   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3178   /* precompute flops count */
3179   if (ptype == MATPRODUCT_AB) {
3180     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3181       const PetscInt st = a->i[i];
3182       const PetscInt en = a->i[i + 1];
3183       for (j = st; j < en; j++) {
3184         const PetscInt brow = a->j[j];
3185         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3186       }
3187     }
3188   } else if (ptype == MATPRODUCT_AtB) {
3189     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3190       const PetscInt anzi = a->i[i + 1] - a->i[i];
3191       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3192       flops += (2. * anzi) * bnzi;
3193     }
3194   } else { /* TODO */
3195     flops = 0.;
3196   }
3197 
3198   mmdata->flops = flops;
3199   PetscCall(PetscLogGpuTimeBegin());
3200 
3201 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3202   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3203   // cuda-12.2 requires non-null csrRowOffsets
3204   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3205   PetscCallCUSPARSE(stat);
3206   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3207   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3208   {
3209     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3210      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3211   */
3212     void *dBuffer1 = NULL;
3213     void *dBuffer2 = NULL;
3214     void *dBuffer3 = NULL;
3215     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3216     size_t bufferSize1 = 0;
3217     size_t bufferSize2 = 0;
3218     size_t bufferSize3 = 0;
3219     size_t bufferSize4 = 0;
3220     size_t bufferSize5 = 0;
3221 
3222     /* ask bufferSize1 bytes for external memory */
3223     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3224     PetscCallCUSPARSE(stat);
3225     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3226     /* inspect the matrices A and B to understand the memory requirement for the next step */
3227     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3228     PetscCallCUSPARSE(stat);
3229 
3230     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3231     PetscCallCUSPARSE(stat);
3232     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3233     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3234     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3235     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3236     PetscCallCUSPARSE(stat);
3237     PetscCallCUDA(cudaFree(dBuffer1));
3238     PetscCallCUDA(cudaFree(dBuffer2));
3239 
3240     /* get matrix C non-zero entries C_nnz1 */
3241     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3242     c->nz = (PetscInt)C_nnz1;
3243     /* allocate matrix C */
3244     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3245     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3246     Ccsr->values = new THRUSTARRAY(c->nz);
3247     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3248     /* update matC with the new pointers */
3249     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3250     PetscCallCUSPARSE(stat);
3251 
3252     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3253     PetscCallCUSPARSE(stat);
3254     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3255     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3256     PetscCallCUSPARSE(stat);
3257     PetscCallCUDA(cudaFree(dBuffer3));
3258     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3259     PetscCallCUSPARSE(stat);
3260     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3261   }
3262   #else
3263   size_t bufSize2;
3264   /* ask bufferSize bytes for external memory */
3265   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3266   PetscCallCUSPARSE(stat);
3267   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3268   /* inspect the matrices A and B to understand the memory requirement for the next step */
3269   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3270   PetscCallCUSPARSE(stat);
3271   /* ask bufferSize again bytes for external memory */
3272   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3273   PetscCallCUSPARSE(stat);
3274   /* The CUSPARSE documentation is not clear, nor the API
3275      We need both buffers to perform the operations properly!
3276      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3277      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3278      is stored in the descriptor! What a messy API... */
3279   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3280   /* compute the intermediate product of A * B */
3281   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3282   PetscCallCUSPARSE(stat);
3283   /* get matrix C non-zero entries C_nnz1 */
3284   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3285   c->nz = (PetscInt)C_nnz1;
3286   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3287                       mmdata->mmBufferSize / 1024));
3288   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3289   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3290   Ccsr->values = new THRUSTARRAY(c->nz);
3291   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3292   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3293   PetscCallCUSPARSE(stat);
3294   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3295   PetscCallCUSPARSE(stat);
3296   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3297 #else
3298   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3299   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3300                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3301   PetscCallCUSPARSE(stat);
3302   c->nz                = cnz;
3303   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3304   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3305   Ccsr->values = new THRUSTARRAY(c->nz);
3306   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3307 
3308   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3309   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3310      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3311      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3312   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3313                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3314   PetscCallCUSPARSE(stat);
3315 #endif
3316   PetscCall(PetscLogGpuFlops(mmdata->flops));
3317   PetscCall(PetscLogGpuTimeEnd());
3318 finalizesym:
3319   c->free_a = PETSC_TRUE;
3320   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3321   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3322   c->free_ij = PETSC_TRUE;
3323   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3324     PetscInt      *d_i = c->i;
3325     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3326     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3327     ii = *Ccsr->row_offsets;
3328     jj = *Ccsr->column_indices;
3329     if (ciscompressed) d_i = c->compressedrow.i;
3330     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3331     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3332   } else {
3333     PetscInt *d_i = c->i;
3334     if (ciscompressed) d_i = c->compressedrow.i;
3335     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3336     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3337   }
3338   if (ciscompressed) { /* need to expand host row offsets */
3339     PetscInt r = 0;
3340     c->i[0]    = 0;
3341     for (k = 0; k < c->compressedrow.nrows; k++) {
3342       const PetscInt next = c->compressedrow.rindex[k];
3343       const PetscInt old  = c->compressedrow.i[k];
3344       for (; r < next; r++) c->i[r + 1] = old;
3345     }
3346     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3347   }
3348   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3349   PetscCall(PetscMalloc1(m, &c->ilen));
3350   PetscCall(PetscMalloc1(m, &c->imax));
3351   c->maxnz         = c->nz;
3352   c->nonzerorowcnt = 0;
3353   c->rmax          = 0;
3354   for (k = 0; k < m; k++) {
3355     const PetscInt nn = c->i[k + 1] - c->i[k];
3356     c->ilen[k] = c->imax[k] = nn;
3357     c->nonzerorowcnt += (PetscInt)!!nn;
3358     c->rmax = PetscMax(c->rmax, nn);
3359   }
3360   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3361   PetscCall(PetscMalloc1(c->nz, &c->a));
3362   Ccsr->num_entries = c->nz;
3363 
3364   C->nonzerostate++;
3365   PetscCall(PetscLayoutSetUp(C->rmap));
3366   PetscCall(PetscLayoutSetUp(C->cmap));
3367   Ccusp->nonzerostate = C->nonzerostate;
3368   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3369   C->preallocated     = PETSC_TRUE;
3370   C->assembled        = PETSC_FALSE;
3371   C->was_assembled    = PETSC_FALSE;
3372   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3373     mmdata->reusesym = PETSC_TRUE;
3374     C->offloadmask   = PETSC_OFFLOAD_GPU;
3375   }
3376   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3377   PetscFunctionReturn(PETSC_SUCCESS);
3378 }
3379 
3380 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3381 
3382 /* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *apiopt = NULL, *apiname = NULL, *algtitle = NULL;

    /* Map the product type to its old-style API option/function name and the generic
       MatProduct option title; the option query itself is identical for every type */
    switch (product->type) {
    case MATPRODUCT_AB:
      apiopt   = "-matmatmult_backend_cpu";
      apiname  = "MatMatMult";
      algtitle = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiopt   = "-mattransposematmult_backend_cpu";
      apiname  = "MatTransposeMatMult";
      algtitle = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiopt   = "-matptap_backend_cpu";
      apiname  = "MatPtAP";
      algtitle = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiopt   = "-matrart_backend_cpu";
      apiname  = "MatRARt";
      algtitle = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiopt   = "-matmatmatmult_backend_cpu";
      apiname  = "MatMatMatMult";
      algtitle = "MatProduct_ABC";
      break;
    default:
      break;
    }
    if (apiopt) {
      if (product->api_user) { /* product was requested through the old-style API (e.g. MatMatMult()) */
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, apiname, "Mat");
        PetscCall(PetscOptionsBool(apiopt, "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else { /* product was requested through the MatProduct API */
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, algtitle, "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      /* sparse-times-dense has a CUDA implementation unless A lives on the CPU */
      if (!product->A->boundtocpu) mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      else PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products go through the basic driver built on top of the pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3499 
/* yy = A*xx: delegate to the shared kernel with no add vector, non-transposed op */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3506 
/* zz = A*xx + yy: delegate to the shared kernel with an add vector, non-transposed op */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3513 
/* yy = A^H*xx: delegate to the shared kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3520 
/* zz = A^H*xx + yy: delegate to the shared kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3527 
/* yy = A^T*xx: delegate to the shared kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3534 
/* y[idx[i]] += x[i] for 0 <= i < n, one thread per entry.
   The caller (MatMultAddKernel_SeqAIJCUSPARSE) launches ceil(n/256) blocks of 256 threads,
   so the guard below handles the partially filled last block. The entries of idx are the
   compressed-row indices, so they are distinct and the unsynchronized += does not race. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* compute the flat index in PetscInt: with 64-bit indices n may exceed INT_MAX, and the
     old "int i = blockIdx.x * blockDim.x + threadIdx.x" would overflow for such launches */
  PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3540 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (no add) or alias zz (in-place add); all vectors are CUDA vectors.
   This single kernel backs all the MatMult/MatMultAdd/MatMult(Hermitian)Transpose(Add) wrappers. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x; set below from the CSR matrix dimensions */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: op(A) x is zero, so the result is just yy (or zero if there is no yy) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose, applied non-transposed */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try { /* thrust operations below may throw */
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* gather x[cprowIndices[i]] into workVector[i] (VecCUDAEqualsReverse assigns right to left) */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      /* opA is used as an array index into matDescr_SpMV/cuSpMV below, hence the range check */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        /* lazily create the per-opA sparse matrix descriptor from the cached CSR arrays */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        /* create dense vector descriptors, size the SpMV work buffer, and cache everything for reuse */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy csrmv interface */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose case: the SpMV already produced the full-length result with the right beta */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz)); /* multiply-add per stored nonzero */
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); /* no add for the first entry of each nonzero row */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3726 
/* zz = A^T*xx + yy: delegate to the shared kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3733 
/* Assembly happens on the CPU AIJ data; the device copy is refreshed lazily
   (see MatSeqAIJCUSPARSECopyToGPU() called from the mult kernels) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3740 
3741 /*@
3742   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3743   (the default parallel PETSc format).
3744 
3745   Collective
3746 
3747   Input Parameters:
3748 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3749 . m    - number of rows
3750 . n    - number of columns
3751 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
3752 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3753 
3754   Output Parameter:
3755 . A - the matrix
3756 
3757   Level: intermediate
3758 
3759   Notes:
3760   This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
3761   calculations. For good matrix assembly performance the user should preallocate the matrix
3762   storage by setting the parameter `nz` (or the array `nnz`).
3763 
3764   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3765   MatXXXXSetPreallocation() paradgm instead of this routine directly.
3766   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3767 
3768   The AIJ format, also called
3769   compressed row storage, is fully compatible with standard Fortran
3770   storage.  That is, the stored row and column indices can begin at
3771   either one (as in Fortran) or zero.
3772 
3773   Specify the preallocated storage with either nz or nnz (not both).
3774   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3775   allocation.
3776 
3777 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3778 @*/
3779 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3780 {
3781   PetscFunctionBegin;
3782   PetscCall(MatCreate(comm, A));
3783   PetscCall(MatSetSizes(*A, m, n, m, n));
3784   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3785   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3786   PetscFunctionReturn(PETSC_SUCCESS);
3787 }
3788 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed query functions to detach before handing off to the base AIJ destroy */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* release the GPU-side data: unfactored matrices carry a Mat_SeqAIJCUSPARSE in spptr,
     factored ones carry the triangular-factor structures */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3810 
3811 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3812 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate via the CPU AIJ path, then convert the copy in place back to the CUSPARSE type */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3820 
/* Y <- a*X + Y for two MATSEQAIJCUSPARSE matrices, performed on the device when
   possible. Dispatches on MatStructure: identical patterns use one cublas axpy on
   the value arrays; SUBSET uses cusparse csrgeam; anything else falls back to the
   host implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share this implementation (one is bound to CPU):
       invalidate Y's cached transpose and use the host kernel */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical
     row offsets and column indices (checked on the device) imply the same pattern */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* general sparse addition via cusparse csrgeam, writing the result in place
       into Y's arrays; b is the coefficient of Y (1.0) */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* &a and &b are host pointers, so switch the pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit workspace query/allocation for csrgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a single
       cublas axpy on the nonzero values suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns (or the one-column workaround): do the work on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3906 
/* Scale all stored values of Y in place on the device: y_a <- a * y_a via cublas scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  /* scaling invalidates any cached diagonal information */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3926 
/* Zero all stored values of A, on both the device (when allocated) and the host */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ *)A->data;
  PetscBool   onGPU = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    /* zero the device CSR values, and those of the cached transpose, if they exist */
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;
      if (csr->values) {
        onGPU = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* zero the host copy too, then record whether both copies are now in sync */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = onGPU ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3953 
/* Switch A's operation tables between the host (SeqAIJ) and device (CUSPARSE)
   implementations. flg == PETSC_TRUE binds to the CPU; PETSC_FALSE binds back
   to the GPU. Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host copy is current before routing operations to it */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the SeqAIJ sub-operation table (array access hooks etc.) */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* install the device implementations and the GPU-aware array access hooks */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-only optimization: enable them only when bound to CPU and present */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4017 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE. Creates the cuSPARSE side structures
   (handle, algorithm selections) if absent, installs the device operation tables,
   and retypes the object. Supports MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX and
   MAT_INPLACE_MATRIX semantics of MatConvert(). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* for MAT_INPLACE_MATRIX, B aliases A itself */
  B = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      /* all cuSPARSE work goes on PETSc's default stream */
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the lighter tri-factors structure instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation tables and compose the type-specific methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4077 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then
   convert it to the CUSPARSE type in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4085 
4086 /*MC
4087    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4088 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format; all matrix operations are performed on the GPU using
   the cuSPARSE library.
4092 
4093    Options Database Keys:
4094 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4095 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4096                                       Other options include ell (ellpack) or hyb (hybrid).
4097 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4098 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4099 
4100   Level: beginner
4101 
4102 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4103 M*/
4104 
/* Register the cusparse solver package for every factorization kind it supports */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(factors) / sizeof(factors[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4114 
/* Tear down the Mat_SeqAIJCUSPARSE structure hanging off mat->spptr */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *sp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (sp) {
    /* device representations of A and its cached transpose */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&sp->mat, sp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&sp->matTranspose, sp->format));
    /* auxiliary device buffers (deleting a null pointer is a no-op) */
    delete sp->workVector;
    delete sp->rowoffsets_gpu;
    delete sp->csr2csc_i;
    delete sp->coords;
    if (sp->handle) PetscCallCUSPARSE(cusparseDestroy(sp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4132 
/* Free a CsrMatrix: its three device arrays, then the struct; nulls the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4145 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor structure (legacy csrsv path, pre CUDA 11.4):
   descriptor, solve info, CSR data, solve/csr2csc workspaces, and host staging array */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    /* PetscFree also sets *trifactor to NULL */
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4164 
/* Free a Mat_SeqAIJCUSPARSEMultStruct (device matrix representation used for SpMV):
   the matrix payload (CSR or, pre CUDA 11, ELL/HYB), descriptors, device scalar
   constants, and any cached SpMV/SpMM descriptors and workspaces */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached SpMV setup (buffer + dense-vector descriptors) per operation kind */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4211 
/* Release all factorization data held in a Mat_SeqAIJCUSPARSETriFactors, leaving
   the structure itself (and its cusparse handle) alive so it can be refilled by a
   new factorization. The CUDA >= 11.4 branch frees the SpSV-based solve data; the
   older branch frees the legacy csrsv triangular-factor structures. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    /* row/column permutation index arrays from the factorization ordering */
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* cudaFree(NULL) is a documented no-op, so these need no null checks */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    /* host-side staging copies used during factorization */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4265 
/* Fully destroy a tri-factors structure: reset its contents, then release the
   cusparse handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy(fs->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4276 
/* Lexicographic less-than on (row, column) index pairs, usable on host and device */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;                      /* rows decide first */
    return thrust::get<1>(t1) < thrust::get<1>(t2);    /* columns break ties */
  }
};
4285 
/* Mark the cached device transpose of A as stale; with destroy == PETSC_TRUE its
   storage (and the csr2csc index map) is freed as well */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (sp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&sp->matTranspose, sp->format));
      delete sp->csr2csc_i;
      sp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4301 
/* Container destructor for the device-side COO struct: its perm and jmap arrays
   live in GPU memory, the struct itself on the host */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = static_cast<MatCOOStruct_SeqAIJ *>(*data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4312 
/* Set up COO assembly for a SeqAIJCUSPARSE matrix: run the host SeqAIJ COO
   preallocation (which builds the jmap/perm maps), then mirror the resulting COO
   struct into device memory for use by MatSetValuesCOO_SeqAIJCUSPARSE(). The
   coo_i/coo_j arrays may live in host or device memory. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* indices were given on the device; stage host copies for the SeqAIJ routine */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4353 
/* Device kernel: fold COO input values kv[] into the CSR value array a[].
   jmap[i]..jmap[i+1] delimits, and perm[] indexes, the COO entries mapping to CSR
   slot i. Launched 1-D; the grid-stride loop makes any positive grid size cover
   all nnz entries. INSERT_VALUES overwrites a[i]; otherwise the sum is added. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4364 
/* Insert/add the COO values v[] (host or device memory) into the device CSR
   values of A, using the maps prepared by MatSetPreallocationCOO_SeqAIJCUSPARSE().
   imode selects INSERT_VALUES (overwrite) or ADD_VALUES (accumulate). */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  /* fetch the device-resident COO struct attached at preallocation time */
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* values are fully overwritten; no need to sync old ones */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* Bug fix: the previous launch config computed (int)(Annz + 255) / 256, which
       casts the 64-bit PetscCount to int BEFORE dividing (the cast binds tighter
       than '/') and overflows once Annz exceeds INT_MAX - 255. Divide first, then
       narrow, clamping to the CUDA x-grid-dimension limit (2^31 - 1); the kernel
       uses a grid-stride loop, so a clamped grid still covers all entries. */
    const PetscCount nwanted = (Annz + 255) / 256;
    const int        nblocks = nwanted > (PetscCount)2147483647 ? 2147483647 : (int)nwanted;
    MatAddCOOValues<<<nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  /* free the temporary device copy of v[] if we staged one */
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4404 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows.

  Both `i` and `j` must be non-NULL; if either is NULL the routine returns without providing any pointers.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* nothing to do unless both output pointers were requested */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* build (once) and cache a device copy of the full host row offsets a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4452 
4453 /*@C
4454   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4455 
4456   Not Collective
4457 
4458   Input Parameters:
4459 + A          - the matrix
4460 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4461 . i          - the CSR row pointers
4462 - j          - the CSR column indices
4463 
4464   Level: developer
4465 
4466 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4467 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: restoring does not depend on the compression mode */
  /* invalidate the caller's borrowed device pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4478 
4479 /*@C
4480   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4481 
4482   Not Collective
4483 
4484   Input Parameter:
4485 . A - a `MATSEQAIJCUSPARSE` matrix
4486 
4487   Output Parameter:
4488 . a - pointer to the device data
4489 
4490   Level: developer
4491 
4492   Note:
4493   May trigger host-device copies if up-to-date matrix data is on host
4494 
4495 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4496 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its value array directly */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* make sure the device copy is current before handing out a pointer into it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read access: offload mask and cached transpose stay valid */
  *a = matrix->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4514 
4515 /*@C
4516   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4517 
4518   Not Collective
4519 
4520   Input Parameters:
4521 + A - a `MATSEQAIJCUSPARSE` matrix
4522 - a - pointer to the device data
4523 
4524   Level: developer
4525 
4526 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4527 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no diagonal or object-state invalidation needed; just drop the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4537 
4538 /*@C
4539   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4540 
4541   Not Collective
4542 
4543   Input Parameter:
4544 . A - a `MATSEQAIJCUSPARSE` matrix
4545 
4546   Output Parameter:
4547 . a - pointer to the device data
4548 
4549   Level: developer
4550 
4551   Note:
4552   May trigger host-device copies if up-to-date matrix data is on host
4553 
4554 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4555 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its value array directly */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: bring current values to the device first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify the device values: the GPU copy becomes authoritative
     and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4575 /*@C
4576   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4577 
4578   Not Collective
4579 
4580   Input Parameters:
4581 + A - a `MATSEQAIJCUSPARSE` matrix
4582 - a - pointer to the device data
4583 
4584   Level: developer
4585 
4586 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4587 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the borrowed device pointer is no longer valid for the caller */
  /* values may have been modified: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4599 
4600 /*@C
4601   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4602 
4603   Not Collective
4604 
4605   Input Parameter:
4606 . A - a `MATSEQAIJCUSPARSE` matrix
4607 
4608   Output Parameter:
4609 . a - pointer to the device data
4610 
4611   Level: developer
4612 
4613   Note:
4614   Does not trigger host-device copies and flags data validity on the GPU
4615 
4616 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4617 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its value array directly */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no host-to-device copy here, the caller will overwrite the values */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the GPU copy becomes authoritative and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4636 
4637 /*@C
4638   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4639 
4640   Not Collective
4641 
4642   Input Parameters:
4643 + A - a `MATSEQAIJCUSPARSE` matrix
4644 - a - pointer to the device data
4645 
4646   Level: developer
4647 
4648 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4649 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the borrowed device pointer is no longer valid for the caller */
  /* values were rewritten on the device: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4661 
/* Lexicographic (row, column) "less than" on (row, col, value, flag) tuples; used below to merge COO entries */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;           /* different rows: order by row */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* same row: order by column */
  }
};
4670 
/* Unary functor adding a fixed offset to an integer index (e.g. shifting column indices of a concatenated block) */
struct Shift {
  int delta; /* constant offset applied to every input */

  Shift(int offset) : delta(offset) { }
  __host__ __device__ inline int operator()(const int &idx) { return idx + delta; }
};
4677 
/* Merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows, i.e. C = [A';B']' in MATLAB
   notation: row i of C holds row i of A followed by row i of B, with B's column indices shifted by
   A->cmap->n. With MAT_INITIAL_MATRIX the merged structure is built entirely on the GPU and a gather
   permutation is stored in Ccusp->coords; with MAT_REUSE_MATRIX only the numerical values are refreshed
   through that permutation. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* error message fixed: "number or rows" -> "number of rows" */
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    /* assemble C's cusparse structures by hand; the host CSR arrays are mirrored at the end */
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* coords records where each A/B entry landed in C so MAT_REUSE_MATRIX can scatter values cheaply */
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand A and B to COO row indices so their entries can be merged by (row, col) */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* shift B's column indices on the fly by A's column count */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      /* merge the two sorted COO streams; the 1/0 flag in wPerm remembers which entries came from A */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split the merged positions into coords[0..Annz) (A's entries) and coords[Annz..) (B's entries) */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C' is simply A' stacked on top of B' (shifted row offsets), no merge needed */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one so B's first (shifted) offset overwrites A's trailing offset */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the structure (i, j) on the host so C behaves like a regular SeqAIJ matrix */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is fixed; scatter the fresh values of A and B into C via coords */
    /* error message fixed: "number or rows" -> "number of rows" */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* coords[0..Annz) are A's destinations, coords[Annz..) are B's (see MAT_INITIAL_MATRIX branch) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4958 
/* Copy selected entries of A's device value array into v[]: when idx is non-NULL, gather the n values
   at positions idx[] (a gather on the device); when idx is NULL, copy the first n values contiguously.
   v may point to either host or device memory. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem; /* true when v is device memory */
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index list, then gather av[idx[k]] on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer, copied back to host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the n scalars above traveled device->host; log as GPU-to-CPU traffic
     (previously mislogged with PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4994 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
4995