xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 3201ab8d70a3eea669f338f8bd746342de1cf5db)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
/* Option-name tables consumed by PetscOptionsEnum(); the trailing entries (enum type name,
   option prefix, and NULL sentinel) follow the PetscOptionsEnum() list convention — see the
   calls in MatSetFromOptions_SeqAIJCUSPARSE() below. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
62 
63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73 #endif
74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
84 
85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
88 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
89 
90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
92 
93 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96 
97 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
98 {
99   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
100 
101   PetscFunctionBegin;
102   switch (op) {
103   case MAT_CUSPARSE_MULT:
104     cusparsestruct->format = format;
105     break;
106   case MAT_CUSPARSE_ALL:
107     cusparsestruct->format = format;
108     break;
109   default:
110     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
111   }
112   PetscFunctionReturn(PETSC_SUCCESS);
113 }
114 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats.

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
138 
139 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
140 {
141   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
142 
143   PetscFunctionBegin;
144   cusparsestruct->use_cpu_solve = use_cpu;
145   PetscFunctionReturn(PETSC_SUCCESS);
146 }
147 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
171 
172 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
173 {
174   PetscFunctionBegin;
175   switch (op) {
176   case MAT_FORM_EXPLICIT_TRANSPOSE:
177     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
178     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
179     A->form_explicit_transpose = flg;
180     break;
181   default:
182     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
183     break;
184   }
185   PetscFunctionReturn(PETSC_SUCCESS);
186 }
187 
// Process -mat_cusparse_* command line options for a MATSEQAIJCUSPARSE matrix.
// Options are only consulted for non-factored matrices (A->factortype == MAT_FACTOR_NONE).
// Note the ordering: -mat_cusparse_storage_format is processed after
// -mat_cusparse_mult_storage_format, so when both are given the former wins.
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same positional-consistency check as for the SpMV algorithm above */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
223 
224 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Rebuild the device-side factored matrix M = (L with unit diagonal) + U from the host LU
// factors stored in PETSc's skewed format (L rows in a->i/a->j/a->a; U rows reached backwards
// through a->diag, with the diagonal stored inverted), then (re)run cusparseSpSV_analysis()
// for both triangular solves. The structure (row pointers, column indices, descriptors,
// SpSV buffers) is built only once; subsequent calls refresh only the values.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];     // number of entries of L in row i (strictly lower part)
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of entries of U in row i (including the diagonal)
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same CSR arrays; the fill mode/diag type attributes tell cuSPARSE
      // which triangle (and which diagonal convention) to use in each solve
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (PETSc stores its reciprocal)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
321 #else
/* Build (first call) or refresh the values of (later calls) the unit-lower-triangular factor L
   of the host ILU factorization as a 32-bit-index CSR matrix on the GPU, including the implicit
   unit diagonal that PETSc's skewed storage omits, and run the csrsv solve analysis.
   Pre-CUDA-11.4 legacy path (see the #if around this function). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the later thrust assign() (host-to-device copy) is fast */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal, then each
           subsequent row is its strictly-lower entries followed by a unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep AALo for later value-only updates but free the index arrays */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only: structure is unchanged, rebuild the value array and re-upload */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
452 
453 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
454 {
455   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
456   PetscInt                           n                  = A->rmap->n;
457   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
458   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
459   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
460   const MatScalar                   *aa = a->a, *v;
461   PetscInt                          *AiUp, *AjUp;
462   PetscInt                           i, nz, nzUpper, offset;
463 
464   PetscFunctionBegin;
465   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
466   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
467     try {
468       /* next, figure out the number of nonzeros in the upper triangular matrix. */
469       nzUpper = adiag[0] - adiag[n];
470       if (!upTriFactor) {
471         PetscScalar *AAUp;
472 
473         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
474 
475         /* Allocate Space for the upper triangular matrix */
476         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
477         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
478 
479         /* Fill the upper triangular matrix */
480         AiUp[0] = (PetscInt)0;
481         AiUp[n] = nzUpper;
482         offset  = nzUpper;
483         for (i = n - 1; i >= 0; i--) {
484           v  = aa + adiag[i + 1] + 1;
485           vi = aj + adiag[i + 1] + 1;
486 
487           /* number of elements NOT on the diagonal */
488           nz = adiag[i] - adiag[i + 1] - 1;
489 
490           /* decrement the offset */
491           offset -= (nz + 1);
492 
493           /* first, set the diagonal elements */
494           AjUp[offset] = (PetscInt)i;
495           AAUp[offset] = (MatScalar)1. / v[nz];
496           AiUp[i]      = AiUp[i + 1] - (nz + 1);
497 
498           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
499           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
500         }
501 
502         /* allocate space for the triangular factor information */
503         PetscCall(PetscNew(&upTriFactor));
504         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
505 
506         /* Create the matrix description */
507         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
508         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
509   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
510         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
511   #else
512         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
513   #endif
514         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
515         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
516 
517         /* set the operation */
518         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
519 
520         /* set the matrix */
521         upTriFactor->csrMat              = new CsrMatrix;
522         upTriFactor->csrMat->num_rows    = n;
523         upTriFactor->csrMat->num_cols    = n;
524         upTriFactor->csrMat->num_entries = nzUpper;
525 
526         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
527         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
528 
529         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
530         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
531 
532         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
533         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
534 
535         /* Create the solve analysis information */
536         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
537         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
538   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
539         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
540                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
541         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
542   #endif
543 
544         /* perform the solve analysis */
545         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
546                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
547 
548         PetscCallCUDA(WaitForCUDA());
549         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
550 
551         /* assign the pointer */
552         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
553         upTriFactor->AA_h                                          = AAUp;
554         PetscCallCUDA(cudaFreeHost(AiUp));
555         PetscCallCUDA(cudaFreeHost(AjUp));
556         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
557       } else {
558         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
559         /* Fill the upper triangular matrix */
560         offset = nzUpper;
561         for (i = n - 1; i >= 0; i--) {
562           v = aa + adiag[i + 1] + 1;
563 
564           /* number of elements NOT on the diagonal */
565           nz = adiag[i] - adiag[i + 1] - 1;
566 
567           /* decrement the offset */
568           offset -= (nz + 1);
569 
570           /* first, set the diagonal elements */
571           upTriFactor->AA_h[offset] = 1. / v[nz];
572           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
573         }
574         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
575         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
576       }
577     } catch (char *ex) {
578       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
579     }
580   }
581   PetscFunctionReturn(PETSC_SUCCESS);
582 }
583 #endif
584 
/*
  Push the host ILU factors of A to the GPU and cache the row/column
  permutations there (when they are not the identity) for use by MatSolve.

  On CUDA >= 11.4 the factored matrix is rebuilt via the generic SpSV path;
  older CUDA versions use the legacy lower/upper triangular builders plus a
  work vector for the two-stage solve.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis = aij->row, colis = aij->icol;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif

  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* Cache the row permutation on the device, once, unless it is the identity */
  PetscCall(ISIdentity(rowis, &rowIdentity));
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowis, &ridx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx, ridx + nrows);
    PetscCall(ISRestoreIndices(rowis, &ridx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }

  /* Cache the column permutation on the device, once, unless it is the identity */
  PetscCall(ISIdentity(colis, &colIdentity));
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(colis, &cidx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx, cidx + nrows);
    PetscCall(ISRestoreIndices(colis, &cidx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
631 
632 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build/refresh the device copy of the host ICC/Cholesky factor (A = Ut D U) for the
// CUDA >= 11.4 SpSV path used by MatSolve_SeqAIJCUSPARSE_Cholesky().
//   - First call: rearrange the host factor into a regular unit-diagonal CSR matrix U
//     plus a separate diagonal array D, allocate device storage, create the cusparse
//     descriptors, and allocate the SpSV buffers for both the U and Ut solves.
//   - Every call with CPU-fresh factors: re-copy the numeric values and re-run the
//     (numeric) cusparseSpSV_analysis for both solves.
// NOTE(review): the "Cheolesky" spelling is a typo, kept because callers use this name.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D; // host staging buffers for values and diagonal
  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; Ai doubles as U's row pointers since row lengths are unchanged
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse; Ma and D are kept for the value-refresh path below, only Mj is temporary
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value: refresh numeric values into the (possibly pre-existing) host staging buffers
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // strictly-upper entries are copied negated
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
717 
// Solve Ut D U x = b, using the device factors built by
// MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(). The sequence is:
//   1) X = Pr b        (row permutation, only if fs->rpermIndices is set)
//   2) solve Ut Y = X  (SpSV with the transpose op applied to U)
//   3) Y = D .* Y      (element-wise; D already holds the inverted diagonal, see the builder)
//   4) solve U Z = Y   (Z lands in X, or directly in x when no column permutation is needed)
//   5) x = Pc X        (column permutation, only if fs->cpermIndices is set)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: bind b's storage directly to the X descriptor (read-only use below)
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y: rebind the X descriptor to the output location first
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
773 #else
// Build the GPU triangular factors for ICC (legacy CUDA < 11.4 path).
// Both factors are stored in the SAME upper-triangular CSR layout: the upper solve uses
// the non-transpose op, while the lower solve reuses the upper structure with the
// transpose op (see loTriFactor->solveOp below), so no explicit transpose is formed.
// First call allocates structure + values and runs the csrsv analysis; subsequent calls
// (factors already present) only refresh the numeric values on the device.
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; // host (pinned) CSR structure of the upper factor
  PetscScalar                       *AAUp;        // host (pinned) values of the upper factor
  PetscScalar                       *AALo;        // host (pinned) values of the lower factor (same sparsity as upper)
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; // same data viewed through the SBAIJ layout
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] is the diagonal entry of row i */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* off-diagonal entries: negated for U, additionally scaled by 1/diag for L */
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* intentionally UPPER: the lower factor is the transpose of this storage (solveOp below) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structure already on the GPU: recompute values only and re-upload.
           Same value recurrence as the first-time branch above. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
970 #endif
971 
/*
  Push the host ICC factor of A to the GPU and cache the (inverse) row
  permutation there when the ordering is nontrivial, so the device solve can
  apply Pr to b and Pc to x without host round-trips.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscBool                     identity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
#endif
  factors->nnz = (aij->nz - m) * 2 + m; /* count both triangles, diagonal once */

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache permutation (rows) and its inverse (columns) on the device */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *iperm, *perm;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &iperm));
    PetscCall(ISGetIndices(rowperm, &perm));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(perm, perm + m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(iperm, iperm + m);
    PetscCall(ISRestoreIndices(invperm, &iperm));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm, &perm));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1013 
/*
  Numeric Cholesky/ICC factorization: the factorization itself runs on the host
  (so A is pulled down first), then the solve callbacks are installed and the
  resulting factors are mirrored to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* Ut D U is symmetric, so the same routine serves solve and solvetranspose */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* pick the MatSolve variant based on whether the ordering is the identity */
  Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
  PetscBool   identity;

  PetscCall(ISIdentity(bseq->row, &identity));
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* mirror the triangular factors to the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1046 
1047 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1048 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1049 {
1050   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1054   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1055   cusparseIndexBase_t                indexBase;
1056   cusparseMatrixType_t               matrixType;
1057   cusparseFillMode_t                 fillMode;
1058   cusparseDiagType_t                 diagType;
1059 
1060   PetscFunctionBegin;
1061   /* allocate space for the transpose of the lower triangular factor */
1062   PetscCall(PetscNew(&loTriFactorT));
1063   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1064 
1065   /* set the matrix descriptors of the lower triangular factor */
1066   matrixType = cusparseGetMatType(loTriFactor->descr);
1067   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1068   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1069   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1070 
1071   /* Create the matrix description */
1072   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1073   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1074   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1075   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1076   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1077 
1078   /* set the operation */
1079   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1080 
1081   /* allocate GPU space for the CSC of the lower triangular factor*/
1082   loTriFactorT->csrMat                 = new CsrMatrix;
1083   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1084   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1085   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1086   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1087   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1088   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1089 
1090   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1091   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1092   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1093                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1094                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1095   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1096   #endif
1097 
1098   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1099   {
1100     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1101     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1102                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1103   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1104                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1105   #else
1106                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1107   #endif
1108     PetscCallCUSPARSE(stat);
1109   }
1110 
1111   PetscCallCUDA(WaitForCUDA());
1112   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1113 
1114   /* Create the solve analysis information */
1115   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1116   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1117   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1118   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1119                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1120   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1121   #endif
1122 
1123   /* perform the solve analysis */
1124   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1125                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1126 
1127   PetscCallCUDA(WaitForCUDA());
1128   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1129 
1130   /* assign the pointer */
1131   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1132 
1133   /*********************************************/
1134   /* Now the Transpose of the Upper Tri Factor */
1135   /*********************************************/
1136 
1137   /* allocate space for the transpose of the upper triangular factor */
1138   PetscCall(PetscNew(&upTriFactorT));
1139   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1140 
1141   /* set the matrix descriptors of the upper triangular factor */
1142   matrixType = cusparseGetMatType(upTriFactor->descr);
1143   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1144   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1145   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1146 
1147   /* Create the matrix description */
1148   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1149   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1150   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1151   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1152   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1153 
1154   /* set the operation */
1155   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1156 
1157   /* allocate GPU space for the CSC of the upper triangular factor*/
1158   upTriFactorT->csrMat                 = new CsrMatrix;
1159   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1160   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1161   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1162   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1163   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1164   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1165 
1166   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1167   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1168   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1169                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1170                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1171   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1172   #endif
1173 
1174   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1175   {
1176     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1177     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1178                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1179   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1180                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1181   #else
1182                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1183   #endif
1184     PetscCallCUSPARSE(stat);
1185   }
1186 
1187   PetscCallCUDA(WaitForCUDA());
1188   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1189 
1190   /* Create the solve analysis information */
1191   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1192   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1193   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1194   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1195                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1196   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1197   #endif
1198 
1199   /* perform the solve analysis */
1200   /* christ, would it have killed you to put this stuff in a function????????? */
1201   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1202                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1203 
1204   PetscCallCUDA(WaitForCUDA());
1205   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1206 
1207   /* assign the pointer */
1208   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 #endif
1212 
/* Unary functor turning a PetscScalar back into a PetscInt by truncating its real part.
   Used with thrust::transform to recover integer permutation indices that were carried
   through a cuSPARSE conversion as scalar values. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar val) { return static_cast<PetscInt>(PetscRealPart(val)); }
};
1216 
/* Build or refresh the explicit transpose A^T of a SeqAIJ matrix on the GPU, caching it in
   cusparsestruct->matTranspose.

   First call: allocates the transpose mult struct (descriptor, device-resident alpha/beta
   constants, CSR or ELL/HYB storage) and, for CSR, computes a value permutation csr2csc_i.
   Later calls: when A->transupdated is false, only the numerical values are refreshed via a
   gather through the cached permutation; when it is true, returns immediately.

   Collective? No - sequential matrix; errors raised with PETSC_COMM_SELF. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  /* make sure the device copy of A is current before transposing it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* cached transpose is already up to date: nothing to do */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats cannot be updated in place below; drop any stale cached transpose */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* transpose dims are swapped: A^T is cmap->n rows by rmap->n cols */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* 32-bit copy of A's row offsets (a->i is PetscInt), needed by csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* legacy (CUDA < 11) path: HYB has no direct transpose, so round-trip HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value permutation once: push the sequence 0,1,2,... (stored as
         scalars) through csr2csc, then read the permuted values back as integer indices
         into csr2csc_i.  Subsequent transpose refreshes then reduce to the single
         thrust::copy gather at the bottom of this branch's enclosing block. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* csr2cscEx2 requires a caller-provided workspace; query its size first */
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* decode the permuted sequence back into integer gather indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* refresh the transpose's numerical values with one gather through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1409 
1410 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Solve A x = b using the cached cuSPARSE SpSV LU factorization (CUDA >= 11.4 path).
// The factorization may carry row/column permutations (fs->rpermIndices / fs->cpermIndices);
// when present, b is gathered into the scratch buffer fs->X before the lower solve, and the
// result of the upper solve is scattered back into x afterwards.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *bptr;
  PetscScalar                          *xptr;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  const cusparseOperation_t             opNone  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                        nrows   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xptr));
  PetscCall(VecCUDAGetArrayRead(b, &bptr));
  xdev = thrust::device_pointer_cast(xptr);
  bdev = thrust::device_pointer_cast(bptr);

  if (fs->rpermIndices) {
    // Gather b through the row permutation into the scratch vector fs->X and solve from there
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // No row permutation: the rhs array can be consumed directly
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bptr));
  }

  // Lower triangular solve L Y = X. Note that cusparseSpSV_solve() secretly uses the
  // external buffer that was handed to cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opNone, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_L));

  // Upper triangular solve U X = Y; target fs->X when a column permutation must still be applied
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xptr));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opNone, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_U));

  if (fs->cpermIndices) {
    // Scatter the permuted solution held in fs->X back into x via the column permutation
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), xdev));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bptr));
  PetscCall(VecCUDARestoreArrayWrite(x, &xptr));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1462 
/* Solve A^T x = b with the cached cuSPARSE SpSV LU factorization (CUDA >= 11.4 path).
   Reuses the non-transpose factor matrices (spMatDescr_L/U) with opA = TRANSPOSE.
   The transpose SpSV descriptors and their work buffers are created lazily on the first
   call (createdTransposeSpSVDescr) and the symbolic analysis is redone whenever
   updatedTransposeSpSVAnalysis is false.  Row/column permutations from the factorization
   are applied via gathers through the scratch buffer fs->X, mirroring MatSolve_SeqAIJCUSPARSE_LU. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    /* one-time setup: create Lt/Ut SpSV descriptors and size + allocate their work buffers */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    /* (re)run the symbolic analysis for the transposed solves; results are cached in the descriptors */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X (transpose solve applies the factors in reverse order: U^T first, then L^T)
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1533 #else
1534 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the legacy (pre-CUDA-11.4) csrsv path for a factorization that
   carries row/column permutations.  The transposed triangular factors
   (loTriFactorPtrTranspose / upTriFactorPtrTranspose) are built lazily on first use by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().  The transpose solve applies the factors
   in reverse order: U^T first, then L^T. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather b into x, which then serves as the solve's rhs */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1586 
/* Solve A^T x = b with the cached triangular factors when the factorization used the
   natural ordering, so no row/column permutations need to be applied. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;

  PetscFunctionBegin;
  /* Build the transposed triangular factors lazily, the first time a transpose solve is requested */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  /* Map the vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* For the transposed system the upper factor is applied first: solve U^T work = b */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* ... then the lower factor: solve L^T x = work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x, loT->solvePolicy, loT->solveBuffer));

  /* Hand the vectors back to PETSc */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1624 
/* Solve A x = b using the cached lower/upper triangular factors, applying the row
   permutation to b on the way in and the column permutation to x on the way out.
   The permutations are applied on the GPU with thrust permutation iterators. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]].
     The copy length comes from the distance between the index (permutation) iterators,
     so the same base pointer bGPU can be used for both ends of the range. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: L * x = tempGPU (forward substitution) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: U * tempGPU = x (backward substitution) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1664 
/* Solve A x = b with the cached triangular factors when the factorization used the
   natural ordering; no permutations are needed so b feeds the L-solve directly. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;

  PetscFunctionBegin;
  /* Map the vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Forward substitution: solve L work = b */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Backward substitution: solve U x = work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  /* Hand the vectors back to PETSc */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1694 #endif
1695 
1696 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization of A into fact, done in place on the GPU with cusparseXcsrilu02().
   The sparsity pattern, descriptors and buffers were set up by MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0();
   this routine only refreshes the values and redoes the value-dependent SpSV analysis. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact; the factorization then overwrites fact's values in place */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the GPU */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is blocking, so only pay for this check in debug builds */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1750 
/* Symbolic ILU(0) factorization: since ILU(0) introduces no fill, fact reuses A's
   sparsity pattern. This routine copies A's i,j to fact, creates the cuSPARSE
   descriptors for M (the in-place factor container), L and U, queries/allocates all
   buffers, runs the structural csrilu02 analysis, and estimates the numeric FLOPs.
   The numeric phase is MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) keeps A's pattern, so no extra fill is needed */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj; /* device pointers to A's 32-bit row offsets and column indices */

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are views of the same storage (fs->csrVal): L is unit lower triangular, U upper triangular */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  /* Work vectors backing the dense-vector descriptors used by SpSV */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft; /* host copies of row offsets/diagonal positions; shadow the device Ai above */
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow = Ai[i + 1] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Heuristically take nzLeft as about half the row; the previous exact count (Adiag[i] - Ai[i]) was
          a dead store immediately overwritten, so it has been removed.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1907 
/* Triangular solves for the ICC(0) (Cholesky-like) factorization: x = (L L^T)^{-1} b,
   done as a forward solve with L followed by a backward solve with L^T. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *b_d;
  PetscScalar                  *x_d;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Point the dense-vector descriptors at b (input) and the internal work vector Y, then solve L y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* L Y = X */
                                       factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Re-point X at the output array and solve L^T x = y using the same L descriptor with the transpose op */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* Lt X = Y */
                                       factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1938 
/* Numeric ICC(0) factorization of A into fact, done in place on the GPU with cusparseXcsric02().
   The pattern, descriptors and buffers were created by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0().
   Fix vs. previous version: the GPU work is now bracketed by PetscLogGpuTimeBegin/End(), matching
   MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(), so the logged GPU flops have an associated GPU time. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact; the factorization then overwrites fact's values in place */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the GPU */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* The zero-pivot query is blocking, so only pay for it in debug builds */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (needs valid factor values), so it must follow cusparseXcsric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* the factorization is symmetric, so transpose solve == solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1991 
1992 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1993 {
1994   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1995   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1996   PetscInt                      m, nz;
1997 
1998   PetscFunctionBegin;
1999   if (PetscDefined(USE_DEBUG)) {
2000     PetscInt  i;
2001     PetscBool flg, missing;
2002 
2003     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2004     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2005     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2006     PetscCall(MatMissingDiagonal(A, &missing, &i));
2007     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2008   }
2009 
2010   /* Free the old stale stuff */
2011   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2012 
2013   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2014      but they will not be used. Allocate them just for easy debugging.
2015    */
2016   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2017 
2018   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2019   fact->factortype             = MAT_FACTOR_ICC;
2020   fact->info.factor_mallocs    = 0;
2021   fact->info.fill_ratio_given  = info->fill;
2022   fact->info.fill_ratio_needed = 1.0;
2023 
2024   aij->row = NULL;
2025   aij->col = NULL;
2026 
2027   /* ====================================================================== */
2028   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2029   /* We'll do in-place factorization on fact                                */
2030   /* ====================================================================== */
2031   const int *Ai, *Aj;
2032 
2033   m  = fact->rmap->n;
2034   nz = aij->nz;
2035 
2036   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2037   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2038   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2039   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2040   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2041   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2042 
2043   /* ====================================================================== */
2044   /* Create mat descriptors for M, L                                        */
2045   /* ====================================================================== */
2046   cusparseFillMode_t fillMode;
2047   cusparseDiagType_t diagType;
2048 
2049   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2050   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2051   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2052 
2053   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2054     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2055     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2056     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2057     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2058   */
2059   fillMode = CUSPARSE_FILL_MODE_LOWER;
2060   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2061   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2062   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2063   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2064 
2065   /* ========================================================================= */
2066   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2067   /* ========================================================================= */
2068   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2069   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2070 
2071   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2072   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2073 
2074   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2075   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2079 
2080   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2081   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2082 
2083   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2084      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2085    */
2086   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2087     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2088     fs->spsvBuffer_L = fs->factBuffer_M;
2089     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2090   } else {
2091     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2092     fs->spsvBuffer_Lt = fs->factBuffer_M;
2093     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2094   }
2095 
2096   /* ========================================================================== */
2097   /* Perform analysis of ic0 on M                                               */
2098   /* The lower triangular part of M has the same sparsity pattern as L          */
2099   /* ========================================================================== */
2100   int              structural_zero;
2101   cusparseStatus_t status;
2102 
2103   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2104   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2105   if (PetscDefined(USE_DEBUG)) {
2106     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2107     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2108     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2109   }
2110 
2111   /* Estimate FLOPs of the numeric factorization */
2112   {
2113     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2114     PetscInt      *Ai, nzRow, nzLeft;
2115     PetscLogDouble flops = 0.0;
2116 
2117     Ai = Aseq->i;
2118     for (PetscInt i = 0; i < m; i++) {
2119       nzRow = Ai[i + 1] - Ai[i];
2120       if (nzRow > 1) {
2121         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2122           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2123         */
2124         nzLeft = (nzRow - 1) / 2;
2125         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2126       }
2127     }
2128     fs->numericFactFlops = flops;
2129   }
2130   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2131   PetscFunctionReturn(PETSC_SUCCESS);
2132 }
2133 #endif
2134 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself is done on the
   host by the plain AIJ kernel; afterwards the solve function pointers on B are switched
   to the GPU variants unless the user asked for CPU solves. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* Bring A's values up to date on the host, then factor on the host */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the freshly computed factors live on the host only */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    /* identity row/column permutations allow the cheaper natural-ordering solve path */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2173 
/* Symbolic LU factorization: discard stale device factors, delegate the symbolic
   phase to the host AIJ implementation, and route the numeric phase to this class. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Any triangular factors left from a previous factorization of B are now invalid */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2184 
/* Symbolic ILU factorization: take the device ILU(0) fast path when possible
   (CUDA >= 11.4, zero fill levels, identity orderings, factorization bound to device),
   otherwise fall back to the host AIJ symbolic phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  /* Only probe the orderings when the user wants the factorization done on the device */
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  /* ILU(0) on device requires no fill and natural (identity) row/column orderings */
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2207 
/* Symbolic ICC factorization: take the device ICC(0) fast path when possible
   (CUDA >= 11.4, zero fill levels, identity permutation, factorization bound to device),
   otherwise fall back to the host AIJ symbolic phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  /* Only probe the ordering when the user wants the factorization done on the device */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  /* ICC(0) on device requires no fill and the natural (identity) ordering */
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2227 
/* Symbolic Cholesky factorization: discard stale device factors, delegate the symbolic
   phase to the host AIJ implementation, and route the numeric phase to this class. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Any triangular factors left from a previous factorization of B are now invalid */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2238 
/* Report the solver package ("cusparse") associated with this factored matrix;
   composed on factor matrices as the "MatFactorGetSolverType_C" method. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2245 
2246 /*MC
2247   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be computed through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2252   algorithms are not recommended. This class does NOT support direct solver operations.
2253 
2254   Level: beginner
2255 
2256 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2257           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2258 M*/
2259 
/* Create the factor matrix B for A of type SEQAIJCUSPARSE and factor type ftype.

   Handles -mat_factor_bind_factorization <host|device> to decide where the
   factorization runs, and installs the symbolic-factorization function pointers
   (GPU variants unless A is bound to the CPU). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool onDevice, onHost;
  char     *prefix;
  char      where[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Let the user choose whether the factorization is carried out on the host or the device */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, where, sizeof(where), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", where, &onDevice));
  PetscCall(PetscStrcasecmp("host", where, &onHost));
  PetscCheck(onDevice || onHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", where);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = onDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2312 
/* Copy matrix values from the device back to the host CSR arrays. Works for both
   unfactored matrices (values come from the CsrMatrix) and, with CUDA >= 11.4,
   device-factored matrices (values come from the triangular-factors csrVal array). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* For factored matrices spptr holds a Mat_SeqAIJCUSPARSETriFactors instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  /* Nothing to do unless the device copy is the only up-to-date one */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* values now valid on both host and device */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2342 
/* Give the caller read/write access to the host values array, syncing from the device first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *a;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  a      = (Mat_SeqAIJ *)A->data;
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2350 
/* End read/write access: host values may have changed, so mark the device copy stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2358 
/* Give the caller read-only access to the host values array, syncing from the device first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *a;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  a      = (Mat_SeqAIJ *)A->data;
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2366 
/* End read-only access: nothing was modified, so only invalidate the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2373 
/* Give the caller write-only access to the host values array; no device sync is needed
   since the existing values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2380 
/* End write-only access: host values were (re)written, so mark the device copy stale */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2388 
/* Return device pointers to the CSR arrays (row offsets i, column indices j, values a)
   of A, plus the memory type (CUDA). Any of the output pointers may be NULL if not wanted.

   Notes:
   - A must not be a factored matrix.
   - cuSPARSE stores indices as 32-bit ints here, so with 64-bit PetscInt the index
     arrays cannot be exposed as PetscInt arrays and we error out. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* fixed grammar in the error message ("does not supported" -> "does not support") */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2419 
/* Ensure the matrix is up to date on the GPU. If the nonzero pattern is unchanged and the
   format is CSR, only the values are copied; otherwise all device structures are destroyed
   and rebuilt from the host CSR arrays (honoring compressed-row storage if in use). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE below when host values are absent */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* New values invalidate the cached transpose values (structure is kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Nonzero pattern (or format) changed: tear down and rebuild all device structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Use the compressed-row view (only nonempty rows) when available */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* No host values yet: copy only the structure and leave offloadmask unset to BOTH */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Pre CUDA-11: build a temporary CSR matrix, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2569 
/* Functor for thrust::for_each over a zip iterator: with t = (x, y), perform y += x */
struct VecCUDAPlusEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
2577 
/* Functor for thrust::for_each over a zip iterator: with t = (src, dst), perform dst = src */
struct VecCUDAEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2585 
/* Functor for thrust::for_each over a zip iterator: with t = (dst, src), perform dst = src
   (copy direction reversed with respect to VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2593 
/* Product-specific data attached to a Mat_Product for matrix-matrix operations
   involving a SEQAIJCUSPARSE matrix; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably records whether C is dense — confirm against the product setup code */
  PetscScalar   *Bt;       /* device buffer (cudaFree'd on destroy); presumably an explicit transpose of B — confirm */
  Mat            X;        /* intermediate dense product matrix, used for MATPRODUCT_RARt/PtAP */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count associated with the product (for logging) */
  CsrMatrix     *Bcsr;     /* CSR copy related to B, deleted on destroy */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;  /* sparse descriptor for B */
  PetscBool            initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;    /* dense descriptor for B (SpMM) */
  cusparseDnMatDescr_t matCDescr;    /* dense descriptor for C (or the intermediate X) */
  PetscInt             Blda, Clda;   /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;                    /* extra SpGEMM work buffers required by the CUDA >= 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* SpMM/SpGEMM work buffer */
  void                 *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* descriptor for sparse-sparse products */
#endif
};
2618 
/* Destroy callback for the MatMatCusparse product data: release all device buffers,
   cusparse descriptors, the intermediate matrix X, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *ctx = static_cast<MatMatCusparse *>(data);

  PetscFunctionBegin;
  /* cudaFree(NULL) is a no-op, so Bt needs no guard; delete NULL is likewise safe */
  PetscCallCUDA(cudaFree(ctx->Bt));
  delete ctx->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* Descriptors and work buffers are only destroyed if they were ever created */
  if (ctx->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ctx->matSpBDescr));
  if (ctx->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matBDescr));
  if (ctx->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matCDescr));
  if (ctx->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(ctx->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (ctx->dBuffer4) PetscCallCUDA(cudaFree(ctx->dBuffer4));
  if (ctx->dBuffer5) PetscCallCUDA(cudaFree(ctx->dBuffer5));
  #endif
  if (ctx->mmBuffer) PetscCallCUDA(cudaFree(ctx->mmBuffer));
  if (ctx->mmBuffer2) PetscCallCUDA(cudaFree(ctx->mmBuffer2));
#endif
  PetscCall(MatDestroy(&ctx->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2642 
2643 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2644 
2645 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2646 {
2647   Mat_Product                  *product = C->product;
2648   Mat                           A, B;
2649   PetscInt                      m, n, blda, clda;
2650   PetscBool                     flg, biscuda;
2651   Mat_SeqAIJCUSPARSE           *cusp;
2652   cusparseStatus_t              stat;
2653   cusparseOperation_t           opA;
2654   const PetscScalar            *barray;
2655   PetscScalar                  *carray;
2656   MatMatCusparse               *mmdata;
2657   Mat_SeqAIJCUSPARSEMultStruct *mat;
2658   CsrMatrix                    *csrmat;
2659 
2660   PetscFunctionBegin;
2661   MatCheckProduct(C, 1);
2662   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2663   mmdata = (MatMatCusparse *)product->data;
2664   A      = product->A;
2665   B      = product->B;
2666   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2667   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2668   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2669      Instead of silently accepting the wrong answer, I prefer to raise the error */
2670   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2671   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2672   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2673   switch (product->type) {
2674   case MATPRODUCT_AB:
2675   case MATPRODUCT_PtAP:
2676     mat = cusp->mat;
2677     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2678     m   = A->rmap->n;
2679     n   = B->cmap->n;
2680     break;
2681   case MATPRODUCT_AtB:
2682     if (!A->form_explicit_transpose) {
2683       mat = cusp->mat;
2684       opA = CUSPARSE_OPERATION_TRANSPOSE;
2685     } else {
2686       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2687       mat = cusp->matTranspose;
2688       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2689     }
2690     m = A->cmap->n;
2691     n = B->cmap->n;
2692     break;
2693   case MATPRODUCT_ABt:
2694   case MATPRODUCT_RARt:
2695     mat = cusp->mat;
2696     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2697     m   = A->rmap->n;
2698     n   = B->rmap->n;
2699     break;
2700   default:
2701     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2702   }
2703   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2704   csrmat = (CsrMatrix *)mat->mat;
2705   /* if the user passed a CPU matrix, copy the data to the GPU */
2706   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2707   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2708   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2709 
2710   PetscCall(MatDenseGetLDA(B, &blda));
2711   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2714   } else {
2715     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2716     PetscCall(MatDenseGetLDA(C, &clda));
2717   }
2718 
2719   PetscCall(PetscLogGpuTimeBegin());
2720 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2721   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2722   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2723   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2724     size_t mmBufferSize;
2725     if (mmdata->initialized && mmdata->Blda != blda) {
2726       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2727       mmdata->matBDescr = NULL;
2728     }
2729     if (!mmdata->matBDescr) {
2730       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2731       mmdata->Blda = blda;
2732     }
2733 
2734     if (mmdata->initialized && mmdata->Clda != clda) {
2735       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2736       mmdata->matCDescr = NULL;
2737     }
2738     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2739       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2740       mmdata->Clda = clda;
2741     }
2742 
2743     if (!mat->matDescr) {
2744       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2745                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2746       PetscCallCUSPARSE(stat);
2747     }
2748     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2749     PetscCallCUSPARSE(stat);
2750     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2751       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2752       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2753       mmdata->mmBufferSize = mmBufferSize;
2754     }
2755     mmdata->initialized = PETSC_TRUE;
2756   } else {
2757     /* to be safe, always update pointers of the mats */
2758     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2759     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2760     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2761   }
2762 
2763   /* do cusparseSpMM, which supports transpose on B */
2764   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2765   PetscCallCUSPARSE(stat);
2766 #else
2767   PetscInt k;
2768   /* cusparseXcsrmm does not support transpose on B */
2769   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2770     cublasHandle_t cublasv2handle;
2771     cublasStatus_t cerr;
2772 
2773     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2774     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2775     PetscCallCUBLAS(cerr);
2776     blda = B->cmap->n;
2777     k    = B->cmap->n;
2778   } else {
2779     k = B->rmap->n;
2780   }
2781 
2782   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2783   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2784   PetscCallCUSPARSE(stat);
2785 #endif
2786   PetscCall(PetscLogGpuTimeEnd());
2787   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2788   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2789   if (product->type == MATPRODUCT_RARt) {
2790     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2791     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2792   } else if (product->type == MATPRODUCT_PtAP) {
2793     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2794     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2795   } else {
2796     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2797   }
2798   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2799   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2800   PetscFunctionReturn(PETSC_SUCCESS);
2801 }
2802 
/*
  Symbolic phase for C = op(A)*op(B) products with A of type MATSEQAIJCUSPARSE and B dense.

  Sets the sizes and (CUDA dense) type of C, and attaches a MatMatCusparse product-data
  structure holding whatever intermediate storage the numeric phase will need.
  The numeric phase itself is MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nr, nc; /* dimensions of the resulting C */
  PetscBool           cdense, isaijcusp;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isaijcusp));
  PetscCheck(isaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  if (product->type == MATPRODUCT_AB) {
    nr = A->rmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    nr = A->cmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    nr = A->rmap->n;
    nc = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    nr = B->cmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    nr = B->rmap->n;
    nc = B->rmap->n;
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, nr, nc, nr, nc));
  /* remember whether the caller handed us a CPU dense C: the numeric phase computes
     on the GPU and converts the result back to MATSEQDENSE at the end */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = cdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* RARt and PtAP need an intermediate dense matrix X to hold A*op(B) */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    const PetscInt xcols = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    /* do not preallocate: the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    PetscCall(MatSetSizes(mm->X, A->rmap->n, xcols, A->rmap->n, xcols));
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2873 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all MATSEQAIJCUSPARSE.

  The symbolic phase has already allocated C's CSR structure on the GPU and stored the
  cuSPARSE SpGEMM descriptors/buffers in C->product->data (a MatMatCusparse); here we only
  (re)compute the numerical values of C, then refresh the SeqAIJ assembly bookkeeping.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    /* skip the SpGEMM entirely; only validate C's GPU structures and redo the assembly bookkeeping in `finalize` */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* structurally empty C: nothing to compute, just refresh assembly state */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* If the symbolic phase exploited symmetry of A (or B), replay the same product-type
     remap here so we pick the mult structs matching what symbolic actually built */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* AtB/ABt use the explicit transpose structs (formed during the symbolic phase),
     since opA/opB above are fixed to non-transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero are device-resident scalars, so cuSPARSE must be in device pointer mode */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: SpGEMMreuse API recomputes values reusing the sparsity analysis done in symbolic */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older CUDA 11.x: recompute with the buffers saved in mmdata, then copy the result into C's arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 legacy csrgemm path, writing directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());               /* make sure the work has completed before stopping the GPU timer */
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* fresh values live on the GPU only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2994 
2995 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2996 {
2997   Mat_Product                  *product = C->product;
2998   Mat                           A, B;
2999   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3000   Mat_SeqAIJ                   *a, *b, *c;
3001   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3002   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3003   PetscInt                      i, j, m, n, k;
3004   PetscBool                     flg;
3005   cusparseStatus_t              stat;
3006   MatProductType                ptype;
3007   MatMatCusparse               *mmdata;
3008   PetscLogDouble                flops;
3009   PetscBool                     biscompressed, ciscompressed;
3010 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3011   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3012   cusparseSpMatDescr_t BmatSpDescr;
3013 #else
3014   int cnz;
3015 #endif
3016   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3017 
3018   PetscFunctionBegin;
3019   MatCheckProduct(C, 1);
3020   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3021   A = product->A;
3022   B = product->B;
3023   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3024   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3025   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3026   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3027   a = (Mat_SeqAIJ *)A->data;
3028   b = (Mat_SeqAIJ *)B->data;
3029   /* product data */
3030   PetscCall(PetscNew(&mmdata));
3031   C->product->data    = mmdata;
3032   C->product->destroy = MatDestroy_MatMatCusparse;
3033 
3034   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3035   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3036   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3037   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3038   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3039   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3040 
3041   ptype = product->type;
3042   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3043     ptype                                          = MATPRODUCT_AB;
3044     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3045   }
3046   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3047     ptype                                          = MATPRODUCT_AB;
3048     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3049   }
3050   biscompressed = PETSC_FALSE;
3051   ciscompressed = PETSC_FALSE;
3052   switch (ptype) {
3053   case MATPRODUCT_AB:
3054     m    = A->rmap->n;
3055     n    = B->cmap->n;
3056     k    = A->cmap->n;
3057     Amat = Acusp->mat;
3058     Bmat = Bcusp->mat;
3059     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3060     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3061     break;
3062   case MATPRODUCT_AtB:
3063     m = A->cmap->n;
3064     n = B->cmap->n;
3065     k = A->rmap->n;
3066     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3067     Amat = Acusp->matTranspose;
3068     Bmat = Bcusp->mat;
3069     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3070     break;
3071   case MATPRODUCT_ABt:
3072     m = A->rmap->n;
3073     n = B->rmap->n;
3074     k = A->cmap->n;
3075     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3076     Amat = Acusp->mat;
3077     Bmat = Bcusp->matTranspose;
3078     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3079     break;
3080   default:
3081     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3082   }
3083 
3084   /* create cusparse matrix */
3085   PetscCall(MatSetSizes(C, m, n, m, n));
3086   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3087   c     = (Mat_SeqAIJ *)C->data;
3088   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3089   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3090   Ccsr  = new CsrMatrix;
3091 
3092   c->compressedrow.use = ciscompressed;
3093   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3094     c->compressedrow.nrows = a->compressedrow.nrows;
3095     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3096     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3097     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3098     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3099     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3100   } else {
3101     c->compressedrow.nrows  = 0;
3102     c->compressedrow.i      = NULL;
3103     c->compressedrow.rindex = NULL;
3104     Ccusp->workVector       = NULL;
3105     Cmat->cprowIndices      = NULL;
3106   }
3107   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3108   Ccusp->mat        = Cmat;
3109   Ccusp->mat->mat   = Ccsr;
3110   Ccsr->num_rows    = Ccusp->nrows;
3111   Ccsr->num_cols    = n;
3112   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3113   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3114   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3115   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3116   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3117   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3118   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3119   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3120   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3121   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3122   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3123     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3124     c->nz                = 0;
3125     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3126     Ccsr->values         = new THRUSTARRAY(c->nz);
3127     goto finalizesym;
3128   }
3129 
3130   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3131   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3132   Acsr = (CsrMatrix *)Amat->mat;
3133   if (!biscompressed) {
3134     Bcsr = (CsrMatrix *)Bmat->mat;
3135 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3136     BmatSpDescr = Bmat->matDescr;
3137 #endif
3138   } else { /* we need to use row offsets for the full matrix */
3139     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3140     Bcsr                 = new CsrMatrix;
3141     Bcsr->num_rows       = B->rmap->n;
3142     Bcsr->num_cols       = cBcsr->num_cols;
3143     Bcsr->num_entries    = cBcsr->num_entries;
3144     Bcsr->column_indices = cBcsr->column_indices;
3145     Bcsr->values         = cBcsr->values;
3146     if (!Bcusp->rowoffsets_gpu) {
3147       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3148       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3149       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3150     }
3151     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3152     mmdata->Bcsr      = Bcsr;
3153 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3154     if (Bcsr->num_rows && Bcsr->num_cols) {
3155       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3156       PetscCallCUSPARSE(stat);
3157     }
3158     BmatSpDescr = mmdata->matSpBDescr;
3159 #endif
3160   }
3161   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3162   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3163   /* precompute flops count */
3164   if (ptype == MATPRODUCT_AB) {
3165     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3166       const PetscInt st = a->i[i];
3167       const PetscInt en = a->i[i + 1];
3168       for (j = st; j < en; j++) {
3169         const PetscInt brow = a->j[j];
3170         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3171       }
3172     }
3173   } else if (ptype == MATPRODUCT_AtB) {
3174     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3175       const PetscInt anzi = a->i[i + 1] - a->i[i];
3176       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3177       flops += (2. * anzi) * bnzi;
3178     }
3179   } else { /* TODO */
3180     flops = 0.;
3181   }
3182 
3183   mmdata->flops = flops;
3184   PetscCall(PetscLogGpuTimeBegin());
3185 
3186 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3187   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3188   // cuda-12.2 requires non-null csrRowOffsets
3189   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3190   PetscCallCUSPARSE(stat);
3191   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3192   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3193   {
3194     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3195      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3196   */
3197     void *dBuffer1 = NULL;
3198     void *dBuffer2 = NULL;
3199     void *dBuffer3 = NULL;
3200     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3201     size_t bufferSize1 = 0;
3202     size_t bufferSize2 = 0;
3203     size_t bufferSize3 = 0;
3204     size_t bufferSize4 = 0;
3205     size_t bufferSize5 = 0;
3206 
3207     /* ask bufferSize1 bytes for external memory */
3208     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3209     PetscCallCUSPARSE(stat);
3210     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3211     /* inspect the matrices A and B to understand the memory requirement for the next step */
3212     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3213     PetscCallCUSPARSE(stat);
3214 
3215     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3216     PetscCallCUSPARSE(stat);
3217     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3218     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3219     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3220     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3221     PetscCallCUSPARSE(stat);
3222     PetscCallCUDA(cudaFree(dBuffer1));
3223     PetscCallCUDA(cudaFree(dBuffer2));
3224 
3225     /* get matrix C non-zero entries C_nnz1 */
3226     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3227     c->nz = (PetscInt)C_nnz1;
3228     /* allocate matrix C */
3229     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3230     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3231     Ccsr->values = new THRUSTARRAY(c->nz);
3232     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3233     /* update matC with the new pointers */
3234     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3235     PetscCallCUSPARSE(stat);
3236 
3237     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3238     PetscCallCUSPARSE(stat);
3239     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3240     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3241     PetscCallCUSPARSE(stat);
3242     PetscCallCUDA(cudaFree(dBuffer3));
3243     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3244     PetscCallCUSPARSE(stat);
3245     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3246   }
3247   #else
3248   size_t bufSize2;
3249   /* ask bufferSize bytes for external memory */
3250   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3251   PetscCallCUSPARSE(stat);
3252   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3253   /* inspect the matrices A and B to understand the memory requirement for the next step */
3254   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3255   PetscCallCUSPARSE(stat);
3256   /* ask bufferSize again bytes for external memory */
3257   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3258   PetscCallCUSPARSE(stat);
3259   /* The CUSPARSE documentation is not clear, nor the API
3260      We need both buffers to perform the operations properly!
3261      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3262      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3263      is stored in the descriptor! What a messy API... */
3264   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3265   /* compute the intermediate product of A * B */
3266   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3267   PetscCallCUSPARSE(stat);
3268   /* get matrix C non-zero entries C_nnz1 */
3269   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3270   c->nz = (PetscInt)C_nnz1;
3271   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3272                       mmdata->mmBufferSize / 1024));
3273   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3274   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3275   Ccsr->values = new THRUSTARRAY(c->nz);
3276   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3277   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3278   PetscCallCUSPARSE(stat);
3279   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3280   PetscCallCUSPARSE(stat);
3281   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3282 #else
3283   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3284   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3285                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3286   PetscCallCUSPARSE(stat);
3287   c->nz                = cnz;
3288   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3289   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3290   Ccsr->values = new THRUSTARRAY(c->nz);
3291   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3292 
3293   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3294   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3295      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3296      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3297   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3298                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3299   PetscCallCUSPARSE(stat);
3300 #endif
3301   PetscCall(PetscLogGpuFlops(mmdata->flops));
3302   PetscCall(PetscLogGpuTimeEnd());
3303 finalizesym:
3304   c->free_a = PETSC_TRUE;
3305   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3306   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3307   c->free_ij = PETSC_TRUE;
3308   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3309     PetscInt      *d_i = c->i;
3310     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3311     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3312     ii = *Ccsr->row_offsets;
3313     jj = *Ccsr->column_indices;
3314     if (ciscompressed) d_i = c->compressedrow.i;
3315     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3316     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3317   } else {
3318     PetscInt *d_i = c->i;
3319     if (ciscompressed) d_i = c->compressedrow.i;
3320     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3321     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3322   }
3323   if (ciscompressed) { /* need to expand host row offsets */
3324     PetscInt r = 0;
3325     c->i[0]    = 0;
3326     for (k = 0; k < c->compressedrow.nrows; k++) {
3327       const PetscInt next = c->compressedrow.rindex[k];
3328       const PetscInt old  = c->compressedrow.i[k];
3329       for (; r < next; r++) c->i[r + 1] = old;
3330     }
3331     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3332   }
3333   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3334   PetscCall(PetscMalloc1(m, &c->ilen));
3335   PetscCall(PetscMalloc1(m, &c->imax));
3336   c->maxnz         = c->nz;
3337   c->nonzerorowcnt = 0;
3338   c->rmax          = 0;
3339   for (k = 0; k < m; k++) {
3340     const PetscInt nn = c->i[k + 1] - c->i[k];
3341     c->ilen[k] = c->imax[k] = nn;
3342     c->nonzerorowcnt += (PetscInt) !!nn;
3343     c->rmax = PetscMax(c->rmax, nn);
3344   }
3345   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3346   PetscCall(PetscMalloc1(c->nz, &c->a));
3347   Ccsr->num_entries = c->nz;
3348 
3349   C->nonzerostate++;
3350   PetscCall(PetscLayoutSetUp(C->rmap));
3351   PetscCall(PetscLayoutSetUp(C->cmap));
3352   Ccusp->nonzerostate = C->nonzerostate;
3353   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3354   C->preallocated     = PETSC_TRUE;
3355   C->assembled        = PETSC_FALSE;
3356   C->was_assembled    = PETSC_FALSE;
3357   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3358     mmdata->reusesym = PETSC_TRUE;
3359     C->offloadmask   = PETSC_OFFLOAD_GPU;
3360   }
3361   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3362   PetscFunctionReturn(PETSC_SUCCESS);
3363 }
3364 
3365 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3366 
/* Select the symbolic-product implementation for products whose A is SEQAIJCUSPARSE.
   Handles sparse (SEQAIJCUSPARSE) or dense (SEQDENSE) B. Depending on the operand types,
   whether any operand is bound to the CPU, and user command-line options, this installs
   either a GPU product routine or falls back to the host SeqAIJ implementation. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  /* Ciscusp defaults to PETSC_TRUE because C only participates as an operand for MATPRODUCT_ABC */
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* B only counts as a GPU operand when neither A nor B has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* Each product type exposes two spellings of the same "force CPU backend" option:
       the api_user branch matches the legacy MatMatMult()-style API names, the other
       matches the MatProduct API names */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    /* forcing the CPU backend is implemented by pretending the operands are not CUSPARSE */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* PtAP/RARt/ABC are composed from pairwise products by the basic implementation */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3484 
/* yy = A*xx: plain matrix-vector product, delegated to the shared mult/add kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /*yy=*/NULL, /*zz=*/yy, /*trans=*/PETSC_FALSE, /*herm=*/PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3491 
/* zz = A*xx + yy: product with add, delegated to the shared mult/add kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, /*trans=*/PETSC_FALSE, /*herm=*/PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3498 
/* yy = A^H*xx: conjugate-transpose product, delegated to the shared mult/add kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /*yy=*/NULL, /*zz=*/yy, /*trans=*/PETSC_TRUE, /*herm=*/PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3505 
/* zz = A^H*xx + yy: conjugate-transpose product with add, delegated to the shared mult/add kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, /*trans=*/PETSC_TRUE, /*herm=*/PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3512 
/* yy = A^T*xx: transpose product (no conjugation), delegated to the shared mult/add kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /*yy=*/NULL, /*zz=*/yy, /*trans=*/PETSC_TRUE, /*herm=*/PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3519 
/* y[idx[i]] += x[i] for i in [0, n).

   Expects a 1-D launch. A grid-stride loop makes correctness independent of the
   launch configuration, and the global index is computed in PetscInt (instead of
   the previous plain int) so the kernel cannot overflow a 32-bit index when
   PetscInt is 64-bit and n exceeds INT_MAX.
   NOTE(review): no atomics are used, so idx[] must contain distinct entries --
   the caller passes cprowIndices (one row index per compressed row), which
   appears to satisfy this; confirm for any new call site. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x * gridDim.x;

  for (PetscInt i = (PetscInt)blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3525 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose
   (and their Add variants). yy may be NULL (no add) or equal to zz (in-place add).
   When the matrix stores compressed rows (all-zero rows dropped), a work vector holds
   the short product (non-transpose case) or the gathered short x (transpose case). */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* logical lengths of x and y for the generic SpMV API; set and used only in the CSR branches below */
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  /* herm without trans would be conj(A)*x, which this kernel does not implement */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: op(A)*x is zero, so the result is just yy (or zero) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose: either let cuSPARSE apply op on the fly, or use an explicitly stored A^T */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* workVector[k] = x[cprowIndices[k]] via a zip + permutation iterator pair */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* for the transposed op the roles of rows/cols swap */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes the cuSpMV[] cache of per-operation descriptors/buffers */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero (multiply + add); without an add vector, one add per nonzero row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3696 
/* zz = A^T*xx + yy: transpose product with add, delegated to the shared mult/add kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, /*trans=*/PETSC_TRUE, /*herm=*/PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3703 
/* Finish assembly on the host; the device copy is refreshed lazily
   (MatSeqAIJCUSPARSECopyToGPU() is invoked by the operations that need it) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3710 
3711 /*@
3712   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3713   (the default parallel PETSc format).
3714 
3715   Collective
3716 
3717   Input Parameters:
3718 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3719 . m    - number of rows
3720 . n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3722 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3723 
3724   Output Parameter:
3725 . A - the matrix
3726 
3727   Level: intermediate
3728 
3729   Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3731   calculations. For good matrix assembly performance the user should preallocate the matrix
3732   storage by setting the parameter `nz` (or the array `nnz`).
3733 
3734   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3735   MatXXXXSetPreallocation() paradgm instead of this routine directly.
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
3737 
3738   The AIJ format, also called
3739   compressed row storage, is fully compatible with standard Fortran
3740   storage.  That is, the stored row and column indices can begin at
3741   either one (as in Fortran) or zero.
3742 
3743   Specify the preallocated storage with either nz or nnz (not both).
3744   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3745   allocation.
3746 
3747 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3748 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* standard create/set-sizes/set-type sequence; the type set triggers the CUSPARSE setup */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* preallocate the host CSR storage; the cast drops const to match the internal signature, nnz is not modified */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3758 
/* Destroy a SEQAIJCUSPARSE matrix: free the GPU-side data, detach all composed
   methods, then fall through to the base SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* every method name composed on this object type; each is reset to NULL before destruction */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                  "MatCUSPARSESetFormat_C",
                                  "MatCUSPARSESetUseCPUSolve_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                  "MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C",
                                  "MatSetValuesCOO_C",
                                  "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* unfactored matrices carry a Mat_SeqAIJCUSPARSE in spptr; factored ones carry triangular factors */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3780 
3781 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3782 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate via the host path: copy as SeqAIJ first, then convert the copy in place back to SEQAIJCUSPARSE */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3790 
/* Y = a*X + Y for two MATSEQAIJCUSPARSE matrices, performed on the GPU when possible.

   Strategy, in order of preference:
   1. SAME_NONZERO_PATTERN: the two value arrays line up entry-for-entry, so a
      single cublasXaxpy() over the nonzero values suffices.
   2. SUBSET_NONZERO_PATTERN: use cusparse csrgeam (sparse matrix addition),
      computing a*X + 1.0*Y in place into Y's CSR arrays.
   3. Anything else: fall back to the CPU implementation MatAXPY_SeqAIJ().
   When the caller did not promise SAME_NONZERO_PATTERN but the index arrays
   happen to match, we detect that on the device and upgrade to case 1. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one of the two matrices is bound to the CPU: do the work on the host and
       mark Y's cached transpose as stale */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nnz and identical CSR index arrays (compared on the device) */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* a and b are host scalars, so switch the handle to host pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit workspace query + allocation for csrgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    /* output aliases Y's arrays: the result overwrites Y in place (valid because X's pattern is a subset of Y's) */
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the default pointer mode used elsewhere in this file */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical layout: treat the nonzero arrays as dense vectors and axpy them */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* DIFFERENT_NONZERO_PATTERN (or the one-column workaround above): CPU fallback */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3876 
/* Scale every stored value of Y by a, entirely on the GPU.

   The CSR value array is viewed as a dense vector of length nz and scaled
   with a single cublasXscal() call; the cached diagonal becomes stale. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values)); /* device pointer to the nonzeros */
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* errors out on PetscInt -> PetscBLASInt overflow */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal info is now stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3896 
/* Zero all stored values of A.

   For an unfactored matrix the device-side CSR copies (and the cached
   transpose, if present) are zeroed with thrust::fill; the host copy is always
   zeroed. The offload mask records whether the device copy is still valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij       = (Mat_SeqAIJ *)A->data;
  PetscBool   devZeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        devZeroed = PETSC_TRUE; /* device copy zeroed too, so both copies stay in sync */
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csr = (CsrMatrix *)cusp->matTranspose->mat;

      if (csr->values) thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n])); /* host copy */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = devZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3923 
/* Switch A's method table between the CPU (SeqAIJ) and GPU (CUSPARSE) implementations.

   flg == PETSC_TRUE binds the matrix to the CPU: the device data is first
   copied back to the host, the Mat ops and the SeqAIJ sub-ops are pointed at
   (or reset to) the host implementations, and the GPU-specific composed
   functions are removed. flg == PETSC_FALSE installs the CUSPARSE versions.
   Factored matrices carry no method table of their own here; only the flag is set. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host arrays are current before any CPU operation runs */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* zeroing the SeqAIJ sub-ops restores the default (host) array accessors */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* array accessors must go through the CUSPARSE versions so host/device copies stay coherent */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3987 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE.

   Depending on reuse, the target B is a fresh duplicate, an existing matrix
   copied into, or A itself (MAT_INPLACE_MATRIX). The conversion allocates the
   GPU-side context (spptr): a Mat_SeqAIJCUSPARSE for regular matrices or a
   Mat_SeqAIJCUSPARSETriFactors for factored ones, each with its own cusparse
   handle tied to PETSc's default CUDA stream. Finally the CUSPARSE method
   table is installed via MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix should live on the GPU as well */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices use the triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4047 
/* MatCreate entry point for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4055 
4056 /*MC
4057    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4058 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL (Ellpack), or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4062 
4063    Options Database Keys:
4064 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4065 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4066                                       Other options include ell (ellpack) or hyb (hybrid).
4067 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4068 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4069 
4070   Level: beginner
4071 
4072 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4073 M*/
4074 
/* Register the cuSPARSE-based factorizations available for MATSEQAIJCUSPARSE.

   All four factor types (LU, Cholesky, ILU, ICC) share a single factory,
   MatGetFactor_seqaijcusparse_cusparse(), under the MATSOLVERCUSPARSE name. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(factors) / sizeof(factors[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4084 
/* Tear down the device-side context hanging off mat->spptr for an unfactored
   MATSEQAIJCUSPARSE matrix: the CSR mult structs (matrix and its cached
   transpose), auxiliary thrust arrays, and the cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *sp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!sp) PetscFunctionReturn(PETSC_SUCCESS); /* nothing was ever allocated */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&sp->mat, sp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&sp->matTranspose, sp->format));
  /* deleting a NULL thrust vector pointer is a no-op */
  delete sp->workVector;
  delete sp->rowoffsets_gpu;
  delete sp->csr2csc_i;
  delete sp->coords;
  if (sp->handle) PetscCallCUSPARSE(cusparseDestroy(sp->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4102 
/* Free the three thrust arrays backing a CsrMatrix plus the struct itself,
   and NULL the caller's pointer so a repeated call is harmless. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4115 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor struct (legacy csrsv path, CUDA < 11.4):
   matrix descriptor, solve info, the CSR copy of the factor, and the
   device/host work buffers. Freeing a NULL pointer is skipped explicitly. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4134 
/* Free a mult struct (the device representation used for MatMult and friends):
   the stored matrix (CSR, or the legacy HYB/ELL object on CUDA < 11), the
   cusparse descriptors, the device-resident scalar constants, and on CUDA >= 11
   the per-operation SpMV descriptors/buffers. Sets *matstruct to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used with
       CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* generic-API descriptors and the three cached SpMV configurations */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4176 
/* Release all factorization-related device data held by a TriFactors context,
   leaving the context itself (and its cusparse handle) intact for reuse.

   The CUDA < 11.4 path frees the four legacy triangular-factor structs and the
   work vector; the CUDA >= 11.4 path frees the SpSV-based factor storage,
   descriptors, analysis info, and host shadow arrays.

   NOTE(review): on the CUDA >= 11.4 branch the freed pointers/descriptors are
   not reset to NULL here, so this relies on callers repopulating them before
   any further use (or before calling Reset again) — confirm against callers. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4230 
/* Fully destroy a TriFactors context: release its device data via Reset(),
   then its cusparse handle and the struct itself. Safe on a NULL context. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4241 
/* Strict lexicographic less-than on (row, col) index pairs; usable on both
   host and device (e.g. as a comparator for thrust::sort). */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;                 /* rows decide first */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* columns break ties */
  }
};
4250 
/* Mark the cached device transpose of A as out of date.

   With destroy == PETSC_TRUE the transpose mult struct and the csr2csc
   permutation used to (re)build it are freed as well; otherwise only the
   transupdated flag is cleared so the transpose is refreshed lazily. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!sp) PetscFunctionReturn(PETSC_SUCCESS); /* no GPU context yet, nothing cached */
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&sp->matTranspose, sp->format));
    delete sp->csr2csc_i;
    sp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4266 
/* PetscContainer destructor for the device-side COO assembly struct:
   frees the GPU copies of jmap/perm and then the struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *cooStruct = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(cooStruct->perm));
  PetscCallCUDA(cudaFree(cooStruct->jmap));
  PetscCall(PetscFree(cooStruct));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4277 
/* Preallocate A from COO index arrays (coo_i, coo_j), which may live in host
   or device memory.

   Device-resident indices are staged to the host first because the host-side
   MatSetPreallocationCOO_SeqAIJ() does the analysis. That call attaches a
   host MatCOOStruct_SeqAIJ to the matrix; we then shallow-copy it and replace
   its jmap (values-to-nonzero map) and perm (input permutation) arrays with
   device copies, attaching the result as the "__PETSc_MatCOOStruct_Device"
   container consumed by MatSetValuesCOO_SeqAIJCUSPARSE(). */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* indices are on the device: make temporary host copies for the CPU analysis */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4323 
/* Kernel: fold user-provided COO values kv[] into the CSR value array a[].

   For nonzero slot i, jmap[i]..jmap[i+1] indexes the run of COO entries that
   map to it and perm[] gives their positions in kv[]; the run is summed and
   either replaces a[i] (INSERT_VALUES) or is added to it (ADD_VALUES).
   Grid-stride loop, so any launch configuration covering >= 1 thread is valid.

   Fix: the index and stride are computed with the built-in unsigned-int
   variables; the products were previously evaluated in 32-bit arithmetic
   before widening to PetscCount, overflowing for launches with
   gridDim.x * blockDim.x >= 2^32. Cast to PetscCount before multiplying. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4334 
/* Insert or add COO values v[] into A on the GPU.

   Uses the device-side COO struct attached by
   MatSetPreallocationCOO_SeqAIJCUSPARSE(). Host-resident v[] is first copied
   to a temporary device buffer. The accumulation itself is one launch of the
   MatAddCOOValues kernel over the nnz of A. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites every nonzero, so write-only access avoids a device sync of stale values */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4374 
4375 /*@C
4376   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4377 
4378   Not Collective
4379 
4380   Input Parameters:
4381 + A          - the matrix
4382 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4383 
4384   Output Parameters:
4385 + i - the CSR row pointers
4386 - j - the CSR column indices
4387 
4388   Level: developer
4389 
4390   Note:
4391   When compressed is true, the CSR structure does not contain empty rows
4392 
4393 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4394 @*/
/* Return device pointers to A's CSR row offsets (i) and column indices (j).

   Either output may be NULL when the caller wants only the other one; the
   per-pointer guards below already support that, so the early return now
   triggers only when BOTH are NULL (previously `!i || !j` returned early and
   silently left a single requested pointer unset).

   When compressed == PETSC_FALSE but the matrix stores compressed rows, a full
   (uncompressed) row-offset array is built on the device once and cached. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS); /* nothing requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4422 
4423 /*@C
4424   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4425 
4426   Not Collective
4427 
4428   Input Parameters:
4429 + A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4431 . i          - the CSR row pointers
4432 - j          - the CSR column indices
4433 
4434   Level: developer
4435 
4436 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4437 @*/
/* Companion to MatSeqAIJCUSPARSEGetIJ(): no device work is needed to release
   the pointers, so this only NULLs the caller's pointers to prevent reuse. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed; /* accepted for symmetry with GetIJ; unused here */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4448 
4449 /*@C
4450   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4451 
4452   Not Collective
4453 
4454   Input Parameter:
4455 . A - a `MATSEQAIJCUSPARSE` matrix
4456 
4457   Output Parameter:
4458 . a - pointer to the device data
4459 
4460   Level: developer
4461 
4462   Note:
4463   May trigger host-device copies if up-to-date matrix data is on host
4464 
4465 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4466 @*/
/* Return a read-only device pointer to A's CSR value array, first making sure
   the device copy is current (which may trigger a host-to-device transfer).
   Only the CSR storage format exposes its values this way. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(sp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)sp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4484 
4485 /*@C
4486   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4487 
4488   Not Collective
4489 
4490   Input Parameters:
4491 + A - a `MATSEQAIJCUSPARSE` matrix
4492 - a - pointer to the device data
4493 
4494   Level: developer
4495 
4496 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4497 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state/offload flags to update, just drop the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4507 
4508 /*@C
4509   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4510 
4511   Not Collective
4512 
4513   Input Parameter:
4514 . A - a `MATSEQAIJCUSPARSE` matrix
4515 
4516   Output Parameter:
4517 . a - pointer to the device data
4518 
4519   Level: developer
4520 
4521   Note:
4522   May trigger host-device copies if up-to-date matrix data is on host
4523 
4524 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4525 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the latest host values to the device before handing out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* caller may write through the pointer: the device copy becomes the authoritative one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4545 /*@C
4546   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4547 
4548   Not Collective
4549 
4550   Input Parameters:
4551 + A - a `MATSEQAIJCUSPARSE` matrix
4552 - a - pointer to the device data
4553 
4554   Level: developer
4555 
4556 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4557 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: bump the object state and drop cached diagonal info */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4569 
4570 /*@C
4571   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4572 
4573   Not Collective
4574 
4575   Input Parameter:
4576 . A - a `MATSEQAIJCUSPARSE` matrix
4577 
4578   Output Parameter:
4579 . a - pointer to the device data
4580 
4581   Level: developer
4582 
4583   Note:
4584   Does not trigger host-device copies and flags data validity on the GPU
4585 
4586 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4587 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no host-to-device copy (contents will be overwritten) */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the device copy is now the valid one; cached transpose values become stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4606 
4607 /*@C
4608   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4609 
4610   Not Collective
4611 
4612   Input Parameters:
4613 + A - a `MATSEQAIJCUSPARSE` matrix
4614 - a - pointer to the device data
4615 
4616   Level: developer
4617 
4618 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4619 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: bump state, invalidate cached diagonal */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4631 
/* Lexicographic (row, col) ordering on (row, col, value, perm) tuples, used to merge two COO streams */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4640 
/* Unary functor adding a fixed offset to an integer index (e.g. column shift when concatenating) */
struct Shift {
  int delta;

  Shift(int shift) : delta(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return delta + c; }
};
4647 
4648 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Build C = [A, B] from scratch: allocate device CSR storage and the multiply descriptors */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* Merge the two CSR structures on the device: expand to COO, merge by (row,col),
         record in Ccusp->coords the permutation that maps A's and B's entries into C
         so MAT_REUSE_MATRIX can update values without rebuilding the pattern */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* B's columns are shifted by A->cmap->n since B sits to the right of A in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place column shift applied above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split merged positions by origin: entries flagged 1 came from A (first Annz coords), 0 from B */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        /* C' = [A'; B'] stacks the transposes: rows of C' are the n columns of C */
        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* B''s first offset overwrites A''s last (both equal a->nz after shift) */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* Mirror the CSR structure on the host so *C is a fully valid SeqAIJ matrix */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    /* c->free_a is set above, so the values must come from PetscShmgetAllocateArray (the free_a
       destroy path deallocates with PetscShmgetDeallocateArray, matching c->i/c->j) */
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscScalar), (void **)&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the sparsity pattern of C is fixed; scatter the current values of A and B
       into C's value array through the coords permutation computed at creation */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values through the first Annz coords entries, B's through the rest */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4928 
/* Copies selected entries of A's device value array into v.
   If idx is provided, v[k] = aa[idx[k]] for k < n; otherwise the first n values are copied.
   v may live on the host or on the device (detected with isCudaMem). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set, then gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is a host buffer: gather into a device scratch array first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* contiguous prefix copy; n may be 0 */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is a host buffer the transfer above was device-to-host, so log GPU->CPU traffic
     (the original PetscLogCpuToGpu logged the wrong direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4964 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
4965