xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 6dd63270497ad23dcf16ae500a87ff2b2a0b7474)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
/* Names in 0-based enum-value order so PetscOptionsEnum() can map a command-line string to the enum value */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
/* Note: CUSPARSE_SPMM_COO_ALG4 (=5) and CSR_ALG1 (=4) are listed out of name order below so array position matches the enum value */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
62 
63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73 #endif
74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
84 
85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
88 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
89 
90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
92 
93 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* For sequential AIJCUSPARSE there is a single storage format, so MULT and ALL are handled identically */
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
114 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation when the matrix type registers one; silently a no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
138 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  /* Record the flag selecting the built-in CPU MatSolve() path; see MatCUSPARSESetUseCPUSolve() */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
147 
/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the built-in CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation when the matrix type registers one; silently a no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
171 
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* When turning the option off, destroy any cached transpose now so a stale one
       cannot be picked up if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    /* All other options are handled by the host (SeqAIJ) implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
187 
/* Process runtime options for a MATSEQAIJCUSPARSE matrix. Options are only examined for
   unfactored matrices (A->factortype == MAT_FACTOR_NONE); factored matrices have nothing tunable here. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* Storage format used for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* Storage format used for all operations; processed after the SpMV-only option so it takes precedence when both are given */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
223 
224 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build on the GPU a single CSR matrix M combining the host LU factors (L without its unit
// diagonal, then the row's diagonal entry, then U's entries right of the diagonal), and set up
// cuSPARSE SpSV descriptors for the two triangular solves. The sparsity pattern (Mi, Mj) is
// built only on the first call; subsequent calls re-copy values only.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];     // number of L entries in row i (strictly below the diagonal)
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of U entries in row i, including the diagonal
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // Both descriptors alias the same CSR arrays of M; the FILL_MODE/DIAG_TYPE attributes tell
      // cuSPARSE which triangle (and which diagonal convention) to use in each solve
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep the host row pointers and the host value buffer, free the temp column indices
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value. The pattern was fixed above, so only Ma needs rebuilding from the factors.
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (the factor stores its reciprocal)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
321 #else
/* Build (first call) or refresh (later calls) the unit-diagonal lower triangular factor L on the
   GPU from the host factored SeqAIJ matrix, using the legacy cuSPARSE csrsv API (CUDA < 11.4).
   On the first call the CSR structure is assembled in pinned host memory, uploaded to thrust
   device arrays, and the csrsv solve analysis is performed; later calls re-upload values only. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try { /* thrust device-vector operations below may throw; converted to a PETSc error in the catch */
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first time: build structure + values, then do the solve analysis */
        PetscScalar *AALo;

        /* pinned host memory for faster host-to-device transfer */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal; each later row i
           holds its strictly-lower entries from A followed by an explicit unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h (pinned values buffer) is kept for value-only updates, the index buffers are freed */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
452 
453 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
454 {
455   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
456   PetscInt                           n                  = A->rmap->n;
457   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
458   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
459   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
460   const MatScalar                   *aa = a->a, *v;
461   PetscInt                          *AiUp, *AjUp;
462   PetscInt                           i, nz, nzUpper, offset;
463 
464   PetscFunctionBegin;
465   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
466   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
467     try {
468       /* next, figure out the number of nonzeros in the upper triangular matrix. */
469       nzUpper = adiag[0] - adiag[n];
470       if (!upTriFactor) {
471         PetscScalar *AAUp;
472 
473         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
474 
475         /* Allocate Space for the upper triangular matrix */
476         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
477         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
478 
479         /* Fill the upper triangular matrix */
480         AiUp[0] = (PetscInt)0;
481         AiUp[n] = nzUpper;
482         offset  = nzUpper;
483         for (i = n - 1; i >= 0; i--) {
484           v  = aa + adiag[i + 1] + 1;
485           vi = aj + adiag[i + 1] + 1;
486 
487           /* number of elements NOT on the diagonal */
488           nz = adiag[i] - adiag[i + 1] - 1;
489 
490           /* decrement the offset */
491           offset -= (nz + 1);
492 
493           /* first, set the diagonal elements */
494           AjUp[offset] = (PetscInt)i;
495           AAUp[offset] = (MatScalar)1. / v[nz];
496           AiUp[i]      = AiUp[i + 1] - (nz + 1);
497 
498           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
499           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
500         }
501 
502         /* allocate space for the triangular factor information */
503         PetscCall(PetscNew(&upTriFactor));
504         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
505 
506         /* Create the matrix description */
507         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
508         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
509   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
510         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
511   #else
512         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
513   #endif
514         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
515         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
516 
517         /* set the operation */
518         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
519 
520         /* set the matrix */
521         upTriFactor->csrMat              = new CsrMatrix;
522         upTriFactor->csrMat->num_rows    = n;
523         upTriFactor->csrMat->num_cols    = n;
524         upTriFactor->csrMat->num_entries = nzUpper;
525 
526         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
527         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
528 
529         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
530         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
531 
532         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
533         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
534 
535         /* Create the solve analysis information */
536         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
537         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
538   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
539         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
540                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
541         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
542   #endif
543 
544         /* perform the solve analysis */
545         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
546                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
547 
548         PetscCallCUDA(WaitForCUDA());
549         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
550 
551         /* assign the pointer */
552         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
553         upTriFactor->AA_h                                          = AAUp;
554         PetscCallCUDA(cudaFreeHost(AiUp));
555         PetscCallCUDA(cudaFreeHost(AjUp));
556         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
557       } else {
558         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
559         /* Fill the upper triangular matrix */
560         offset = nzUpper;
561         for (i = n - 1; i >= 0; i--) {
562           v = aa + adiag[i + 1] + 1;
563 
564           /* number of elements NOT on the diagonal */
565           nz = adiag[i] - adiag[i + 1] - 1;
566 
567           /* decrement the offset */
568           offset -= (nz + 1);
569 
570           /* first, set the diagonal elements */
571           upTriFactor->AA_h[offset] = 1. / v[nz];
572           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
573         }
574         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
575         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
576       }
577     } catch (char *ex) {
578       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
579     }
580   }
581   PetscFunctionReturn(PETSC_SUCCESS);
582 }
583 #endif
584 
/* Push the ILU triangular factors of A onto the GPU and, when the factorization
   used a nontrivial ordering, upload the row/column permutation index arrays
   used by the triangular solves. Safe to call repeatedly: permutation arrays
   are only built the first time they are needed. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS = aij->row, colIS = aij->icol;
  PetscBool                     rowIsIdentity, colIsIdentity;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif

  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* row permutation indices (from a->row), uploaded once */
  PetscCall(ISIdentity(rowIS, &rowIsIdentity));
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(rowIS, &indices));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(indices, indices + nrows);
    PetscCall(ISRestoreIndices(rowIS, &indices));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }

  /* column permutation indices (from a->icol), uploaded once */
  PetscCall(ISIdentity(colIS, &colIsIdentity));
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(colIS, &indices));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(indices, indices + nrows);
    PetscCall(ISRestoreIndices(colIS, &indices));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
631 
632 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build the device (cusparse) representation of the host ICC/Cholesky factor of A.
// The host factor U is stored in a skewed layout (see comments at MatICCFactorSymbolic_SeqAIJ());
// this routine re-arranges it into a plain CSR matrix on device with a unit diagonal,
// keeps the (already inverted) diagonal separately in fs->diag, and performs the SpSV
// analysis needed to later solve with U and Ut.
// NOTE(review): "Cheolesky" is a long-standing typo in the name; kept so call sites match.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D; // host staging buffers: CSR values of U and the diagonal

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; values (fs->csrVal) are filled below on every numeric factorization
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record the host staging buffers for reuse in later numeric factorizations
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values (done on every numeric refactorization)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // off-diagonal entries of U: the host factor stores them with opposite sign, so negate on copy
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
717 
// Solve Ut D U x = b, where the factors were built by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky().
// Steps: (1) apply the row permutation to b if present, (2) solve Ut y = b', (3) scale y by D
// (D already holds inverted diagonal entries, so this is a multiply), (4) solve U x' = y,
// (5) apply the column permutation to produce x. All work stays on the GPU.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data); // only used for the flop count below
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: point the cusparse dense-vector descriptor directly at b's storage
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: solve directly into x's storage
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
773 #else
/* Build (or refresh) the GPU triangular factors for an ICC factorization using the
   legacy csrsv API (CUDA < 11.4). The upper factor U is copied to the GPU as-is
   (unit-diagonal CSR) and the "lower" factor is represented by the SAME sparsity
   pattern solved with CUSPARSE_OPERATION_TRANSPOSE, which is why its fill mode is
   also set to UPPER. First call allocates the factors and runs the csrsv analysis;
   subsequent calls only re-upload the numerical values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* NOTE(review): A->data reinterpreted as Mat_SeqSBAIJ; presumably the factored ICC matrix shares the leading i/j/a layout -- confirm */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] is the (inverted) diagonal entry */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* off-diagonal entries: negate for U; the lower-factor values reuse them scaled by the diagonal */
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* UPPER fill mode is intentional: the lower factor is the transpose of this upper pattern */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Factors already exist on the GPU: only the numerical values need refreshing */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
970 #endif
971 
/* Push the ICC/Cholesky factors of A onto the GPU and, if the factorization
   ordering is not the identity, upload both the permutation and its inverse
   for use as row/column index maps in the GPU triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            permIS  = aij->row;
  PetscBool                     isIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
#endif
  factors->nnz = (aij->nz - m) * 2 + m; // count both triangular factors, sharing the diagonal

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* Upload the permutation (and its inverse) when the ordering is nontrivial */
  PetscCall(ISIdentity(permIS, &isIdentity));
  if (!isIdentity) {
    IS              invPermIS;
    const PetscInt *invPerm, *perm;

    PetscCall(ISInvertPermutation(permIS, PETSC_DECIDE, &invPermIS));
    PetscCall(ISGetIndices(invPermIS, &invPerm));
    PetscCall(ISGetIndices(permIS, &perm));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(perm, perm + m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(invPerm, invPerm + m);
    PetscCall(ISRestoreIndices(invPermIS, &invPerm));
    PetscCall(ISDestroy(&invPermIS));
    PetscCall(ISRestoreIndices(permIS, &perm));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1013 
/* Numeric Cholesky factorization for SeqAIJCUSPARSE: pull A to the host, run the
   host numeric factorization, install the appropriate GPU solve callbacks, and
   push the resulting factors back to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); /* factor on the host */
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* pick the MatSolve variant depending on whether the ordering is natural */
  Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
  PetscBool   natural;

  PetscCall(ISIdentity(bseq->row, &natural));
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build/upload the triangular factors on the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1046 
1047 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1048 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1049 {
1050   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1054   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1055   cusparseIndexBase_t                indexBase;
1056   cusparseMatrixType_t               matrixType;
1057   cusparseFillMode_t                 fillMode;
1058   cusparseDiagType_t                 diagType;
1059 
1060   PetscFunctionBegin;
1061   /* allocate space for the transpose of the lower triangular factor */
1062   PetscCall(PetscNew(&loTriFactorT));
1063   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1064 
1065   /* set the matrix descriptors of the lower triangular factor */
1066   matrixType = cusparseGetMatType(loTriFactor->descr);
1067   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1068   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1069   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1070 
1071   /* Create the matrix description */
1072   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1073   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1074   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1075   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1076   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1077 
1078   /* set the operation */
1079   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1080 
1081   /* allocate GPU space for the CSC of the lower triangular factor*/
1082   loTriFactorT->csrMat                 = new CsrMatrix;
1083   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1084   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1085   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1086   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1087   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1088   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1089 
1090   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1091   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1092   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1093                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1094                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1095   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1096   #endif
1097 
1098   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1099   {
1100     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1101     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1102                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1103   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1104                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1105   #else
1106                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1107   #endif
1108     PetscCallCUSPARSE(stat);
1109   }
1110 
1111   PetscCallCUDA(WaitForCUDA());
1112   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1113 
1114   /* Create the solve analysis information */
1115   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1116   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1117   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1118   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1119                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1120   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1121   #endif
1122 
1123   /* perform the solve analysis */
1124   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1125                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1126 
1127   PetscCallCUDA(WaitForCUDA());
1128   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1129 
1130   /* assign the pointer */
1131   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1132 
1133   /*********************************************/
1134   /* Now the Transpose of the Upper Tri Factor */
1135   /*********************************************/
1136 
1137   /* allocate space for the transpose of the upper triangular factor */
1138   PetscCall(PetscNew(&upTriFactorT));
1139   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1140 
1141   /* set the matrix descriptors of the upper triangular factor */
1142   matrixType = cusparseGetMatType(upTriFactor->descr);
1143   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1144   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1145   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1146 
1147   /* Create the matrix description */
1148   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1149   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1150   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1151   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1152   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1153 
1154   /* set the operation */
1155   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1156 
1157   /* allocate GPU space for the CSC of the upper triangular factor*/
1158   upTriFactorT->csrMat                 = new CsrMatrix;
1159   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1160   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1161   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1162   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1163   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1164   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1165 
1166   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1167   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1168   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1169                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1170                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1171   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1172   #endif
1173 
1174   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1175   {
1176     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1177     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1178                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1179   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1180                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1181   #else
1182                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1183   #endif
1184     PetscCallCUSPARSE(stat);
1185   }
1186 
1187   PetscCallCUDA(WaitForCUDA());
1188   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1189 
1190   /* Create the solve analysis information */
1191   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1192   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1193   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1194   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1195                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1196   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1197   #endif
1198 
1199   /* perform the solve analysis */
1200   /* christ, would it have killed you to put this stuff in a function????????? */
1201   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1202                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1203 
1204   PetscCallCUDA(WaitForCUDA());
1205   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1206 
1207   /* assign the pointer */
1208   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 #endif
1212 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real part;
   used with thrust::transform to recover integer indices stored as scalars. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return static_cast<PetscInt>(PetscRealPart(s)); }
};
1216 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - build (or refresh) an explicit transpose of A on
  the GPU, cached in cusparsestruct->matTranspose, so that transpose products can run as a
  plain SpMV on the transposed storage. A no-op when A->transupdated is already set.

  On the first call (per nonzero pattern) this allocates the transposed CSR structure (or,
  before CUDA 11, a HYB/ELL structure) and computes csr2csc_i: a permutation mapping entries
  of A's value array onto the transpose's value array. Subsequent calls only re-permute the
  current numerical values through csr2csc_i, which is much cheaper than redoing the full
  csr2csc conversion. Row/column index arrays are 32-bit (THRUSTINTARRAY32).
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* cached transpose is still current: nothing to do */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose swaps the row/column dimensions; nnz count is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the host row offsets a->i so csr2csc below can read them on the device */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* legacy (pre-CUDA 11) path: HYB -> CSR -> CSC -> HYB round trip */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps A's row/column dimensions rather than swapping them;
         this appears to rely on square matrices in this deprecated path -- confirm before reuse */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Fill a scalar work array with 0,1,2,...; pushing it through csr2csc leaves, in
         matrixT->values, the source index of each transposed entry. Converting those back
         to integers below yields the csr2csc_i permutation used on every later update. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather A's current values through the cached permutation into the transpose's value array */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1409 
1410 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - solve A x = b with the cached LU factors via cusparseSpSV
  (CUDA >= 11.4 path).

  Pipeline: b is gathered through the row permutation (if any) into the work buffer,
  then L Y = Pb and U X = Y are solved, and finally X is scattered through the column
  permutation (if any) into x. Without permutations, b and x are wrapped directly in
  the dense-vector descriptors to avoid copies.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *amat    = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             nonTrans = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               spsvAlg  = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              nrows    = A->rmap->n;
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> b_ptr;
  thrust::device_ptr<PetscScalar>       x_ptr;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  x_ptr = thrust::device_pointer_cast(x_d);
  b_ptr = thrust::device_pointer_cast(b_d);

  /* Gather b through the row permutation (when present) into factors->X; otherwise
     wrap b itself in the dense-vector descriptor X */
  if (factors->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(b_ptr, factors->rpermIndices->begin()), thrust::make_permutation_iterator(b_ptr, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)b_d));
  }

  /* Lower-triangular solve: L Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  /* cusparseSpSV_solve() secretly reuses the external buffer handed to cusparseSpSV_analysis() */
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, nonTrans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_L));

  /* Upper-triangular solve: U X = Y. Target the work buffer if a column permutation
     still has to be applied, otherwise write straight into x */
  if (factors->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, x_d));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, nonTrans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  /* Scatter the work buffer through the column permutation (when present) into x */
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), x_ptr));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * amat->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1462 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - solve A^T x = b with the cached LU factors via
  cusparseSpSV (CUDA >= 11.4 path), i.e. solve U^T Y = Pb then L^T X = Y, reusing the
  non-transposed matrix descriptors with op = CUSPARSE_OPERATION_TRANSPOSE.

  The transpose-solve SpSV descriptors, buffers and analyses are created lazily on the
  first call and cached in fs (createdTransposeSpSVDescr / updatedTransposeSpSVAnalysis),
  so repeated solves pay only the triangular-solve cost.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    /* size and allocate the external buffers needed by the transpose-solve analyses */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (re)run the analysis phase; reset elsewhere when the factor values change */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1533 #else
/* Open question: could this be avoided by passing op(A) = CUSPARSE_OPERATION_TRANSPOSE to the
   solves in MatSolve_SeqAIJCUSPARSE instead of analyzing the transposed factors separately? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              len        = xx->map->n;
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> b_ptr;
  thrust::device_ptr<PetscScalar>       x_ptr;
  Mat_SeqAIJCUSPARSETriFactors         *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loT        = (Mat_SeqAIJCUSPARSETriFactorStruct *)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upT        = (Mat_SeqAIJCUSPARSETriFactorStruct *)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *work       = (THRUSTARRAY *)triFactors->workVector;

  PetscFunctionBegin;
  /* Lazily build and analyze the transposed triangular factors on first use */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)triFactors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)triFactors->upTriFactorPtrTranspose;
  }

  /* Map both vectors onto the GPU */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));
  x_ptr = thrust::device_pointer_cast(x_d);
  b_ptr = thrust::device_pointer_cast(b_d);

  PetscCall(PetscLogGpuTimeBegin());
  /* Gather b through the row permutation, landing in x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(b_ptr, triFactors->rpermIndices->begin()), thrust::make_permutation_iterator(b_ptr + len, triFactors->rpermIndices->end()), x_ptr);

  /* Triangular solve with the transposed upper factor */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(triFactors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(),
                                         upT->csrMat->row_offsets->data().get(), upT->csrMat->column_indices->data().get(), upT->solveInfo, x_d, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Triangular solve with the transposed lower factor */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(triFactors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(),
                                         loT->csrMat->row_offsets->data().get(), loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x_d, loT->solvePolicy, loT->solveBuffer));

  /* Gather the solution through the column permutation into the work vector; this cannot be done in place */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(x_ptr, triFactors->cpermIndices->begin()), thrust::make_permutation_iterator(x_ptr + len, triFactors->cpermIndices->end()), work->begin());

  /* Copy the permuted solution back into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), work->begin(), work->end(), x_ptr);

  /* Release the vector arrays */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * triFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1586 
/* Solve A^T x = b with the cached triangular factors when the factorization used the natural
   ordering (no row/column permutations to apply). The transposed factors are built lazily. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Create the transposed factors on first use */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Map the PETSc vectors onto device memory */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* For the transposed system the solve order is reversed: U^T first ... */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b_d, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* ... then L^T */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x_d, loT->solvePolicy, loT->solveBuffer));

  /* Hand the vectors back to PETSc */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1624 
/* Solve A x = b with the cached triangular factors, honoring the row/column permutations
   computed during factorization. All steps run on the GPU on PetscDefaultCudaStream:
     1) gather b through rpermIndices into the work vector (apply the row permutation)
     2) forward solve with L  (work vector -> x storage)
     3) backward solve with U (x storage -> work vector)
     4) scatter the work vector through cpermIndices into x (apply the column permutation)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers and wrap them as thrust device pointers for the permutation copies */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[k] = b[rperm[k]]
     (the permutation_iterator range is bounded by the rpermIndices index iterators) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: reads the permuted rhs from tempGPU, writes the intermediate into xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: reads from xarray, writes the unpermuted solution back into tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[k] = tempGPU[cperm[k]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1664 
/* Solve A x = b with the cached triangular factors when the factorization used the natural
   ordering: no permutations are needed, just the two triangular solves through the work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map the PETSc vectors onto device memory */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Forward solve with L: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b_d, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Backward solve with U: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x_d, up->solvePolicy, up->solveBuffer));

  /* Hand the vectors back to PETSc */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1694 #endif
1695 
1696 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization of A, performed in place in fact's device value array (fs->csrVal).

   The sparsity pattern, descriptors and work buffers were set up in
   MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(); here we only refresh the values from A,
   run cusparseXcsrilu02(), and redo the (numeric) SpSV analysis needed by the L/U solves.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact (device-to-device, on PETSc's default stream) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the GPU */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Query for a zero pivot encountered during the numeric factorization (debug builds only) */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed once in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1750 
/* Symbolic setup for a zero-fill ILU factorization (ILU0) on the GPU.

   ILU(0) keeps A's sparsity pattern, so no true symbolic factorization is required: we copy
   A's (i,j) structure to fact, create the cuSPARSE descriptors for M (= L and U stored in one
   CSR), L and U, query and allocate all work buffers, run the csrilu02 analysis phase, estimate
   the FLOPs of a numeric factorization, and install the numeric factorization routine.

   The two IS arguments (row/column permutations) are ignored: ILU0 here works in the natural
   ordering. Only info->fill is recorded from the options. The numeric phase
   (MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0) later factors in place in fs->csrVal.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU0 introduces no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L from ILU has an implicit unit diagonal; the stored diagonal belongs to U */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i]; /* the actual number of entries left of the diagonal in this row */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Note: nzLeft must be the per-row count computed above; overwriting it with an average such as
          (nzRow - 1) / 2 (appropriate only for the symmetric ICC case, where just half the row is stored)
          would mis-estimate the cost on rows whose diagonal is not centered.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1907 
/* Triangular solves for the ICC(0) factorization: solve L y = b, then L^T x = y,
   using the cuSPARSE generic SpSV API with the descriptors prepared in the symbolic phase. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *tf  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *b_d;
  PetscScalar                  *x_d;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b; y lives in the workspace tf->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_Y, tf->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, /* L Y = X */
                                       tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_L));

  /* Backward solve L^T x = y; reuse descriptor X, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, /* Lt X = Y */
                                       tf->dnVecDescr_Y, tf->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1938 
/* Numeric ICC(0) (incomplete Cholesky, zero fill) factorization of A, done in place in fs->csrVal.

   The pattern, descriptors and buffers were prepared by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0();
   here we refresh the values from A, run cusparseXcsric02(), and redo the (numeric) SpSV analysis
   for the L and L^T solves.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact (device-to-device, on PETSc's default stream) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the GPU */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* csric02 errors out with empty matrices (m=0) */
  if (PetscDefined(USE_DEBUG)) {
    /* Query for a zero pivot encountered during the numeric factorization (debug builds only) */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (needs valid values), so it must come after cusparseXcsric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* same routine: both solves use L and L^T */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed once in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1991 
1992 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1993 {
1994   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1995   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1996   PetscInt                      m, nz;
1997 
1998   PetscFunctionBegin;
1999   if (PetscDefined(USE_DEBUG)) {
2000     PetscInt  i;
2001     PetscBool flg, missing;
2002 
2003     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2004     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2005     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2006     PetscCall(MatMissingDiagonal(A, &missing, &i));
2007     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2008   }
2009 
2010   /* Free the old stale stuff */
2011   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2012 
2013   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2014      but they will not be used. Allocate them just for easy debugging.
2015    */
2016   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2017 
2018   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2019   fact->factortype             = MAT_FACTOR_ICC;
2020   fact->info.factor_mallocs    = 0;
2021   fact->info.fill_ratio_given  = info->fill;
2022   fact->info.fill_ratio_needed = 1.0;
2023 
2024   aij->row = NULL;
2025   aij->col = NULL;
2026 
2027   /* ====================================================================== */
2028   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2029   /* We'll do in-place factorization on fact                                */
2030   /* ====================================================================== */
2031   const int *Ai, *Aj;
2032 
2033   m  = fact->rmap->n;
2034   nz = aij->nz;
2035 
2036   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2037   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2038   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2039   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2040   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2041   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2042 
2043   /* ====================================================================== */
2044   /* Create mat descriptors for M, L                                        */
2045   /* ====================================================================== */
2046   cusparseFillMode_t fillMode;
2047   cusparseDiagType_t diagType;
2048 
2049   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2050   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2051   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2052 
2053   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2054     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2055     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2056     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2057     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2058   */
2059   fillMode = CUSPARSE_FILL_MODE_LOWER;
2060   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2061   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2062   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2063   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2064 
2065   /* ========================================================================= */
2066   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2067   /* ========================================================================= */
2068   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2069   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2070 
2071   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2072   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2073 
2074   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2075   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2079 
2080   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2081   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2082 
2083   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2084      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2085    */
2086   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2087     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2088     fs->spsvBuffer_L = fs->factBuffer_M;
2089     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2090   } else {
2091     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2092     fs->spsvBuffer_Lt = fs->factBuffer_M;
2093     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2094   }
2095 
2096   /* ========================================================================== */
2097   /* Perform analysis of ic0 on M                                               */
2098   /* The lower triangular part of M has the same sparsity pattern as L          */
2099   /* ========================================================================== */
2100   int              structural_zero;
2101   cusparseStatus_t status;
2102 
2103   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2104   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2105   if (PetscDefined(USE_DEBUG)) {
2106     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2107     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2108     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2109   }
2110 
2111   /* Estimate FLOPs of the numeric factorization */
2112   {
2113     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2114     PetscInt      *Ai, nzRow, nzLeft;
2115     PetscLogDouble flops = 0.0;
2116 
2117     Ai = Aseq->i;
2118     for (PetscInt i = 0; i < m; i++) {
2119       nzRow = Ai[i + 1] - Ai[i];
2120       if (nzRow > 1) {
2121         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2122           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2123         */
2124         nzLeft = (nzRow - 1) / 2;
2125         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2126       }
2127     }
2128     fs->numericFactFlops = flops;
2129   }
2130   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2131   PetscFunctionReturn(PETSC_SUCCESS);
2132 }
2133 #endif
2134 
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve lives in A's Mat_SeqAIJCUSPARSE; B, the factored matrix, carries a Mat_SeqAIJCUSPARSETriFactors in its spptr.
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* The numeric factorization runs on the host, so make sure the host copy of A is current first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Pick the MatSolve variant: identity row/column permutations allow the natural-ordering kernels */
    Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
    PetscBool   rowid, colid;

    PetscCall(ISIdentity(b->row, &rowid));
    PetscCall(ISIdentity(b->col, &colid));
    if (rowid && colid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* Move the triangular factors over to the GPU for the device solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2173 
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Drop any stale device-side factor data, run the host symbolic phase, and install the CUSPARSE numeric routine */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2184 
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* ILU(0) with identity row/column permutations can be factorized entirely on the device */
  PetscBool rowid = PETSC_FALSE, colid = PETSC_FALSE;
  if (fs->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &rowid));
    PetscCall(ISIdentity(iscol, &colid));
  }
  if (!info->levels && rowid && colid) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* General case: host symbolic factorization, numeric phase handled by the CUSPARSE routine */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2207 
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* ICC(0) with an identity permutation can be factorized entirely on the device */
  PetscBool permid = PETSC_FALSE;
  if (fs->factorizeOnDevice) PetscCall(ISIdentity(perm, &permid));
  if (!info->levels && permid) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* General case: host symbolic factorization, numeric phase handled by the CUSPARSE routine */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2227 
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Discard any stale device-side factor data, run the host symbolic phase, then install the CUSPARSE numeric routine */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2238 
/* Reports the solver-package name (MATSOLVERCUSPARSE) for factor matrices created by this package */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2245 
2246 /*MC
2247   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. The currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance of the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor, and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2253 
2254   Level: beginner
2255 
2256 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2257           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2258 M*/
2259 
/*
  Creates a factor matrix B of type MATSEQAIJCUSPARSE for A, supporting LU/ILU/ILUDT and
  Cholesky/ICC factorizations. The runtime option -mat_factor_bind_factorization {host|device}
  selects where the factorization is carried out when possible (default: device).
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Parse -mat_factor_bind_factorization and record the choice in the tri-factors struct */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  /* Install the symbolic-factorization routines: CUSPARSE versions unless A is bound to the CPU */
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for the incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2312 
/*
  Copies the matrix values from the GPU back to the host CSR array a->a when the device copy
  is the most recent one (offloadmask == PETSC_OFFLOAD_GPU); afterwards both copies are valid.
  Works for unfactored matrices and (with CUDA >= 11.4) for device-factorized matrices.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* For a factored matrix, spptr holds a Mat_SeqAIJCUSPARSETriFactors instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* Unfactored matrix: values live in the CsrMatrix of the mult struct */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2342 
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Bring the values back to the host (if the GPU copy is newer) before exposing them */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2350 
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* The host values may have been modified through the returned pointer, so the device copy is stale now */
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2358 
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Sync values to the host (if the GPU copy is newer) before handing out read-only access */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2366 
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  /* Read-only access: nothing was modified, so the offload mask is left untouched */
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2373 
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* Write access: no device-to-host sync here — presumably callers overwrite every entry (see matching Restore) */
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2380 
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* The host array was written through the returned pointer: invalidate the device copy */
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2388 
/*
  Returns raw device pointers to the CSR arrays (row offsets i, column indices j, values a) of an
  unfactored MATSEQAIJCUSPARSE matrix, along with the memory type (PETSC_MEMTYPE_CUDA). Any output
  argument may be NULL if not wanted. Device indices are 32-bit, so 64-bit PetscInt builds error out.

  Fix: the SETERRQ messages read "cuSparse does not supported 64-bit indices"; corrected the grammar
  ("does not support") and the product-name casing.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Make sure the device copy is current before handing out device pointers */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    /* The device arrays are 32-bit (THRUSTINTARRAY32), so they alias PetscInt only in 32-bit-index builds */
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2419 
/*
  Copies the matrix A from the host to the GPU. If the nonzero pattern is unchanged and the format
  is CSR, only the values are uploaded; otherwise the device-side structures (CsrMatrix, descriptors,
  work vectors) are destroyed and rebuilt from the host CSR (or compressed-row) data.
  A no-op when the device copy is already current. Errors if A is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Values changed, so any cached transpose values are stale (structure still valid) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* Nonzero pattern changed (or non-CSR format): tear down and rebuild the device structures */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* With compressed rows only the nonempty rows are stored; ridx maps them back to global rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE; /* no host values: upload structure only, device copy not fully valid */
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Pre-CUDA-11 path: build a temporary CSR, convert it to HYB/ELL, then discard the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2569 
/* Thrust functor for zip-iterator tuples: accumulates the first element into the second (dst = dst + src) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2577 
/* Thrust functor for zip-iterator tuples: copies the first element into the second (dst = src) */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2585 
/* Thrust functor for zip-iterator tuples: copies the second element into the first (reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2593 
/* Per-product scratch data stored in Mat_Product->data for cuSPARSE mat-mat operations
   (freed by MatDestroy_MatMatCusparse) */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably whether the product matrix C is dense — confirm against users */
  PetscScalar   *Bt;       /* device buffer (cudaMalloc'ed; freed with cudaFree) */
  Mat            X;        /* intermediate dense matrix used for MATPRODUCT_PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2618 
/* Destructor for MatMatCusparse scratch data: releases device buffers, cuSPARSE descriptors,
   the intermediate matrix X, and the struct itself. Unset members are NULL, which every call
   below tolerates (cudaFree(NULL) and delete NULL are no-ops; descriptors are guarded). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2642 
2643 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2644 
2645 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2646 {
2647   Mat_Product                  *product = C->product;
2648   Mat                           A, B;
2649   PetscInt                      m, n, blda, clda;
2650   PetscBool                     flg, biscuda;
2651   Mat_SeqAIJCUSPARSE           *cusp;
2652   cusparseStatus_t              stat;
2653   cusparseOperation_t           opA;
2654   const PetscScalar            *barray;
2655   PetscScalar                  *carray;
2656   MatMatCusparse               *mmdata;
2657   Mat_SeqAIJCUSPARSEMultStruct *mat;
2658   CsrMatrix                    *csrmat;
2659 
2660   PetscFunctionBegin;
2661   MatCheckProduct(C, 1);
2662   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2663   mmdata = (MatMatCusparse *)product->data;
2664   A      = product->A;
2665   B      = product->B;
2666   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2667   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2668   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2669      Instead of silently accepting the wrong answer, I prefer to raise the error */
2670   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2671   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2672   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2673   switch (product->type) {
2674   case MATPRODUCT_AB:
2675   case MATPRODUCT_PtAP:
2676     mat = cusp->mat;
2677     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2678     m   = A->rmap->n;
2679     n   = B->cmap->n;
2680     break;
2681   case MATPRODUCT_AtB:
2682     if (!A->form_explicit_transpose) {
2683       mat = cusp->mat;
2684       opA = CUSPARSE_OPERATION_TRANSPOSE;
2685     } else {
2686       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2687       mat = cusp->matTranspose;
2688       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2689     }
2690     m = A->cmap->n;
2691     n = B->cmap->n;
2692     break;
2693   case MATPRODUCT_ABt:
2694   case MATPRODUCT_RARt:
2695     mat = cusp->mat;
2696     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2697     m   = A->rmap->n;
2698     n   = B->rmap->n;
2699     break;
2700   default:
2701     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2702   }
2703   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2704   csrmat = (CsrMatrix *)mat->mat;
2705   /* if the user passed a CPU matrix, copy the data to the GPU */
2706   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2707   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2708   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2709 
2710   PetscCall(MatDenseGetLDA(B, &blda));
2711   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2714   } else {
2715     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2716     PetscCall(MatDenseGetLDA(C, &clda));
2717   }
2718 
2719   PetscCall(PetscLogGpuTimeBegin());
2720 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2721   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2722   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2723   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2724   #else
2725   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2726   #endif
2727 
2728   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2729   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2730     size_t mmBufferSize;
2731     if (mmdata->initialized && mmdata->Blda != blda) {
2732       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2733       mmdata->matBDescr = NULL;
2734     }
2735     if (!mmdata->matBDescr) {
2736       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737       mmdata->Blda = blda;
2738     }
2739 
2740     if (mmdata->initialized && mmdata->Clda != clda) {
2741       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2742       mmdata->matCDescr = NULL;
2743     }
2744     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2745       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2746       mmdata->Clda = clda;
2747     }
2748 
2749   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2750     if (matADescr) {
2751       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2752       matADescr = NULL;
2753     }
2754   #endif
2755 
2756     if (!matADescr) {
2757       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2758                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2759       PetscCallCUSPARSE(stat);
2760     }
2761 
2762     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2763 
2764     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2765       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2766       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2767       mmdata->mmBufferSize = mmBufferSize;
2768     }
2769 
2770   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
2771     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2772   #endif
2773 
2774     mmdata->initialized = PETSC_TRUE;
2775   } else {
2776     /* to be safe, always update pointers of the mats */
2777     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2778     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2779     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2780   }
2781 
2782   /* do cusparseSpMM, which supports transpose on B */
2783   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2784 #else
2785   PetscInt k;
2786   /* cusparseXcsrmm does not support transpose on B */
2787   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2788     cublasHandle_t cublasv2handle;
2789     cublasStatus_t cerr;
2790 
2791     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2792     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2793     PetscCallCUBLAS(cerr);
2794     blda = B->cmap->n;
2795     k    = B->cmap->n;
2796   } else {
2797     k = B->rmap->n;
2798   }
2799 
2800   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2801   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2802   PetscCallCUSPARSE(stat);
2803 #endif
2804   PetscCall(PetscLogGpuTimeEnd());
2805   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2806   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2807   if (product->type == MATPRODUCT_RARt) {
2808     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2809     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2810   } else if (product->type == MATPRODUCT_PtAP) {
2811     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2812     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2813   } else {
2814     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2815   }
2816   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2817   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2818   PetscFunctionReturn(PETSC_SUCCESS);
2819 }
2820 
/* MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for the
   product of a MATSEQAIJCUSPARSE matrix A with a dense matrix B.  Sets the
   sizes and block sizes of C, makes C a MATSEQDENSECUDA matrix, and creates
   the product data, including the intermediate dense matrix X used by the
   two-step PtAP and RARt products. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nrows, ncols;
  PetscBool           cwascpu, isseqaijcusp;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isseqaijcusp));
  PetscCheck(isseqaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions and block sizes depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    nrows = A->rmap->n;
    ncols = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    nrows = A->cmap->n;
    ncols = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    nrows = A->rmap->n;
    ncols = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    nrows = B->cmap->n;
    ncols = B->cmap->n;
    if (B->cmap->bs > 0) { /* square result: same blocking on both sides */
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    }
    break;
  case MATPRODUCT_RARt:
    nrows = B->rmap->n;
    ncols = B->rmap->n;
    if (B->rmap->bs > 0) { /* square result: same blocking on both sides */
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    }
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, nrows, ncols, nrows, ncols));
  /* remember whether the user handed us a CPU dense C: the numeric phase works
     on the GPU and converts back to MATSEQDENSE at the end in that case */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cwascpu));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = cwascpu;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm cannot transpose B, so keep a buffer that will hold B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* PtAP and RARt go through an intermediate dense matrix X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    const PetscInt xcols = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    /* do not preallocate: the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    PetscCall(MatSetSizes(mm->X, A->rmap->n, xcols, A->rmap->n, xcols));
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2900 
/* MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the
   sparse-sparse product C = op(A)*op(B) with A, B, C all MATSEQAIJCUSPARSE.
   The sparsity pattern of C was fixed in the symbolic phase; only the values
   are (re)computed here on the GPU, via cusparseSpGEMMreuse (CUDA >= 11.4),
   cusparseSpGEMM (CUDA 11.0-11.3) or cusparse_csr_spgemm (CUDA < 11). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values already on the GPU: only run the assembly bookkeeping below */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute numerically */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  /* CopyToGpu would silently skip a CPU-bound matrix, so reject that case explicitly */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host-side values of A and B are on the GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* mirror the product-type remapping done by the symbolic phase for
     symmetric operands, and verify the symbolic phase actually used it */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* select the GPU representations (explicit transposes were formed in the
     symbolic phase, since spgemm itself cannot transpose) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero are device-side scalars, hence device pointer mode */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* reuse path: structure analysis was done once in the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* pre-11.4 API: compute into the descriptor, then copy into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());               /* make sure kernels finished before stopping the GPU timer */
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* fresh values live on the GPU only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3021 
3022 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3023 {
3024   Mat_Product                  *product = C->product;
3025   Mat                           A, B;
3026   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3027   Mat_SeqAIJ                   *a, *b, *c;
3028   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3029   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3030   PetscInt                      i, j, m, n, k;
3031   PetscBool                     flg;
3032   cusparseStatus_t              stat;
3033   MatProductType                ptype;
3034   MatMatCusparse               *mmdata;
3035   PetscLogDouble                flops;
3036   PetscBool                     biscompressed, ciscompressed;
3037 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3038   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3039   cusparseSpMatDescr_t BmatSpDescr;
3040 #else
3041   int cnz;
3042 #endif
3043   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3044 
3045   PetscFunctionBegin;
3046   MatCheckProduct(C, 1);
3047   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3048   A = product->A;
3049   B = product->B;
3050   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3051   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3052   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3053   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3054   a = (Mat_SeqAIJ *)A->data;
3055   b = (Mat_SeqAIJ *)B->data;
3056   /* product data */
3057   PetscCall(PetscNew(&mmdata));
3058   C->product->data    = mmdata;
3059   C->product->destroy = MatDestroy_MatMatCusparse;
3060 
3061   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3062   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3063   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3064   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3065   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3066   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3067 
3068   ptype = product->type;
3069   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3070     ptype                                          = MATPRODUCT_AB;
3071     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3072   }
3073   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3074     ptype                                          = MATPRODUCT_AB;
3075     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3076   }
3077   biscompressed = PETSC_FALSE;
3078   ciscompressed = PETSC_FALSE;
3079   switch (ptype) {
3080   case MATPRODUCT_AB:
3081     m    = A->rmap->n;
3082     n    = B->cmap->n;
3083     k    = A->cmap->n;
3084     Amat = Acusp->mat;
3085     Bmat = Bcusp->mat;
3086     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3087     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3088     break;
3089   case MATPRODUCT_AtB:
3090     m = A->cmap->n;
3091     n = B->cmap->n;
3092     k = A->rmap->n;
3093     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3094     Amat = Acusp->matTranspose;
3095     Bmat = Bcusp->mat;
3096     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3097     break;
3098   case MATPRODUCT_ABt:
3099     m = A->rmap->n;
3100     n = B->rmap->n;
3101     k = A->cmap->n;
3102     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3103     Amat = Acusp->mat;
3104     Bmat = Bcusp->matTranspose;
3105     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3106     break;
3107   default:
3108     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3109   }
3110 
3111   /* create cusparse matrix */
3112   PetscCall(MatSetSizes(C, m, n, m, n));
3113   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3114   c     = (Mat_SeqAIJ *)C->data;
3115   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3116   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3117   Ccsr  = new CsrMatrix;
3118 
3119   c->compressedrow.use = ciscompressed;
3120   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3121     c->compressedrow.nrows = a->compressedrow.nrows;
3122     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3123     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3124     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3125     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3126     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3127   } else {
3128     c->compressedrow.nrows  = 0;
3129     c->compressedrow.i      = NULL;
3130     c->compressedrow.rindex = NULL;
3131     Ccusp->workVector       = NULL;
3132     Cmat->cprowIndices      = NULL;
3133   }
3134   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3135   Ccusp->mat        = Cmat;
3136   Ccusp->mat->mat   = Ccsr;
3137   Ccsr->num_rows    = Ccusp->nrows;
3138   Ccsr->num_cols    = n;
3139   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3140   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3141   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3142   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3143   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3144   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3145   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3146   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3147   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3148   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3149   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3150     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3151     c->nz                = 0;
3152     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3153     Ccsr->values         = new THRUSTARRAY(c->nz);
3154     goto finalizesym;
3155   }
3156 
3157   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3158   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3159   Acsr = (CsrMatrix *)Amat->mat;
3160   if (!biscompressed) {
3161     Bcsr = (CsrMatrix *)Bmat->mat;
3162 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3163     BmatSpDescr = Bmat->matDescr;
3164 #endif
3165   } else { /* we need to use row offsets for the full matrix */
3166     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3167     Bcsr                 = new CsrMatrix;
3168     Bcsr->num_rows       = B->rmap->n;
3169     Bcsr->num_cols       = cBcsr->num_cols;
3170     Bcsr->num_entries    = cBcsr->num_entries;
3171     Bcsr->column_indices = cBcsr->column_indices;
3172     Bcsr->values         = cBcsr->values;
3173     if (!Bcusp->rowoffsets_gpu) {
3174       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3175       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3176       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3177     }
3178     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3179     mmdata->Bcsr      = Bcsr;
3180 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3181     if (Bcsr->num_rows && Bcsr->num_cols) {
3182       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3183       PetscCallCUSPARSE(stat);
3184     }
3185     BmatSpDescr = mmdata->matSpBDescr;
3186 #endif
3187   }
3188   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3189   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3190   /* precompute flops count */
3191   if (ptype == MATPRODUCT_AB) {
3192     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3193       const PetscInt st = a->i[i];
3194       const PetscInt en = a->i[i + 1];
3195       for (j = st; j < en; j++) {
3196         const PetscInt brow = a->j[j];
3197         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3198       }
3199     }
3200   } else if (ptype == MATPRODUCT_AtB) {
3201     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3202       const PetscInt anzi = a->i[i + 1] - a->i[i];
3203       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3204       flops += (2. * anzi) * bnzi;
3205     }
3206   } else { /* TODO */
3207     flops = 0.;
3208   }
3209 
3210   mmdata->flops = flops;
3211   PetscCall(PetscLogGpuTimeBegin());
3212 
3213 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3214   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3215   // cuda-12.2 requires non-null csrRowOffsets
3216   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3217   PetscCallCUSPARSE(stat);
3218   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3219   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3220   {
3221     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3222      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3223   */
3224     void *dBuffer1 = NULL;
3225     void *dBuffer2 = NULL;
3226     void *dBuffer3 = NULL;
3227     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3228     size_t bufferSize1 = 0;
3229     size_t bufferSize2 = 0;
3230     size_t bufferSize3 = 0;
3231     size_t bufferSize4 = 0;
3232     size_t bufferSize5 = 0;
3233 
3234     /* ask bufferSize1 bytes for external memory */
3235     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3236     PetscCallCUSPARSE(stat);
3237     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3238     /* inspect the matrices A and B to understand the memory requirement for the next step */
3239     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3240     PetscCallCUSPARSE(stat);
3241 
3242     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3243     PetscCallCUSPARSE(stat);
3244     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3245     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3246     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3247     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3248     PetscCallCUSPARSE(stat);
3249     PetscCallCUDA(cudaFree(dBuffer1));
3250     PetscCallCUDA(cudaFree(dBuffer2));
3251 
3252     /* get matrix C non-zero entries C_nnz1 */
3253     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3254     c->nz = (PetscInt)C_nnz1;
3255     /* allocate matrix C */
3256     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3257     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3258     Ccsr->values = new THRUSTARRAY(c->nz);
3259     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3260     /* update matC with the new pointers */
3261     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3262     PetscCallCUSPARSE(stat);
3263 
3264     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3265     PetscCallCUSPARSE(stat);
3266     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3267     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3268     PetscCallCUSPARSE(stat);
3269     PetscCallCUDA(cudaFree(dBuffer3));
3270     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3271     PetscCallCUSPARSE(stat);
3272     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3273   }
3274   #else
3275   size_t bufSize2;
3276   /* ask bufferSize bytes for external memory */
3277   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3278   PetscCallCUSPARSE(stat);
3279   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3280   /* inspect the matrices A and B to understand the memory requirement for the next step */
3281   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3282   PetscCallCUSPARSE(stat);
3283   /* ask bufferSize again bytes for external memory */
3284   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3285   PetscCallCUSPARSE(stat);
3286   /* The CUSPARSE documentation is not clear, nor the API
3287      We need both buffers to perform the operations properly!
3288      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3289      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3290      is stored in the descriptor! What a messy API... */
3291   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3292   /* compute the intermediate product of A * B */
3293   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3294   PetscCallCUSPARSE(stat);
3295   /* get matrix C non-zero entries C_nnz1 */
3296   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3297   c->nz = (PetscInt)C_nnz1;
3298   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3299                       mmdata->mmBufferSize / 1024));
3300   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3301   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3302   Ccsr->values = new THRUSTARRAY(c->nz);
3303   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3304   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3305   PetscCallCUSPARSE(stat);
3306   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3307   PetscCallCUSPARSE(stat);
3308   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3309 #else
3310   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3311   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3312                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3313   PetscCallCUSPARSE(stat);
3314   c->nz                = cnz;
3315   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3316   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3317   Ccsr->values = new THRUSTARRAY(c->nz);
3318   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319 
3320   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3321   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3322      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3323      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3324   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3325                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3326   PetscCallCUSPARSE(stat);
3327 #endif
3328   PetscCall(PetscLogGpuFlops(mmdata->flops));
3329   PetscCall(PetscLogGpuTimeEnd());
3330 finalizesym:
3331   c->free_a = PETSC_TRUE;
3332   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3333   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3334   c->free_ij = PETSC_TRUE;
3335   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3336     PetscInt      *d_i = c->i;
3337     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3338     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3339     ii = *Ccsr->row_offsets;
3340     jj = *Ccsr->column_indices;
3341     if (ciscompressed) d_i = c->compressedrow.i;
3342     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3343     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3344   } else {
3345     PetscInt *d_i = c->i;
3346     if (ciscompressed) d_i = c->compressedrow.i;
3347     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3348     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3349   }
3350   if (ciscompressed) { /* need to expand host row offsets */
3351     PetscInt r = 0;
3352     c->i[0]    = 0;
3353     for (k = 0; k < c->compressedrow.nrows; k++) {
3354       const PetscInt next = c->compressedrow.rindex[k];
3355       const PetscInt old  = c->compressedrow.i[k];
3356       for (; r < next; r++) c->i[r + 1] = old;
3357     }
3358     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3359   }
3360   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3361   PetscCall(PetscMalloc1(m, &c->ilen));
3362   PetscCall(PetscMalloc1(m, &c->imax));
3363   c->maxnz         = c->nz;
3364   c->nonzerorowcnt = 0;
3365   c->rmax          = 0;
3366   for (k = 0; k < m; k++) {
3367     const PetscInt nn = c->i[k + 1] - c->i[k];
3368     c->ilen[k] = c->imax[k] = nn;
3369     c->nonzerorowcnt += (PetscInt)!!nn;
3370     c->rmax = PetscMax(c->rmax, nn);
3371   }
3372   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3373   PetscCall(PetscMalloc1(c->nz, &c->a));
3374   Ccsr->num_entries = c->nz;
3375 
3376   C->nonzerostate++;
3377   PetscCall(PetscLayoutSetUp(C->rmap));
3378   PetscCall(PetscLayoutSetUp(C->cmap));
3379   Ccusp->nonzerostate = C->nonzerostate;
3380   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3381   C->preallocated     = PETSC_TRUE;
3382   C->assembled        = PETSC_FALSE;
3383   C->was_assembled    = PETSC_FALSE;
3384   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3385     mmdata->reusesym = PETSC_TRUE;
3386     C->offloadmask   = PETSC_OFFLOAD_GPU;
3387   }
3388   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3389   PetscFunctionReturn(PETSC_SUCCESS);
3390 }
3391 
3392 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3393 
/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  /* Decide which backend can serve this product: B dense, B on the GPU, and (for ABC) C on the GPU */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *option = NULL, *title = NULL, *man = NULL;

    /* Every product type exposes the same "fall back to CPU" switch; only the option
       name and help strings differ, so look them up instead of repeating the block */
    switch (product->type) {
    case MATPRODUCT_AB:
      man    = "MatMatMult";
      title  = product->api_user ? "MatMatMult" : "MatProduct_AB";
      option = product->api_user ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      man    = "MatTransposeMatMult";
      title  = product->api_user ? "MatTransposeMatMult" : "MatProduct_AtB";
      option = product->api_user ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      man    = "MatPtAP";
      title  = product->api_user ? "MatPtAP" : "MatProduct_PtAP";
      option = product->api_user ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      man    = "MatRARt";
      title  = product->api_user ? "MatRARt" : "MatProduct_RARt";
      option = product->api_user ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      man    = "MatMatMatMult";
      title  = product->api_user ? "MatMatMatMult" : "MatProduct_ABC";
      option = product->api_user ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    default:
      break;
    }
    if (option) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(option, "Use CPU code", man, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      /* sparse-dense products go through CUDA kernels unless A lives on the CPU */
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are composed from pairwise GPU products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3511 
/* yy = A xx: plain multiply (no added vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3518 
/* zz = A xx + yy: multiply-add (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3525 
/* yy = A^H xx: Hermitian-transpose multiply (trans = herm = PETSC_TRUE in the shared kernel) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3532 
/* zz = A^H xx + yy: Hermitian-transpose multiply-add */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3539 
/* yy = A^T xx: transpose multiply (herm = PETSC_FALSE, so no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3546 
/* y[idx[i]] += x[i] for 0 <= i < n: scatter-add the compressed work vector x into the
   full-length vector y at the positions listed in idx.
   Launch: 1D grid on any stream; a grid-stride loop covers all n entries regardless of
   the launch configuration.
   NOTE(review): assumes entries of idx are distinct (duplicate indices would race on
   y[idx[i]], exactly as in the previous one-thread-per-entry version) — confirm at call sites. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* accumulate the global index and stride in PetscInt (not 32-bit int) so the
     index arithmetic cannot overflow when n exceeds 2^31 */
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (PetscInt)gridDim.x * blockDim.x) y[idx[i]] += x[i];
}
3552 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared worker behind all MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add)
   implementations above: yy may be NULL (no add), and yy == zz is handled in-place where possible. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x, taken from the CSR struct below */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: op(A) x == 0, so the result is just yy (or zero when there is no yy) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* apply op(A) via cuSPARSE's (conjugate-)transposed SpMV on the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose (built lazily on first need) with a non-transposed SpMV */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* gather x[cprowIndices[i]] into workVector[i]; the zip + permutation iterators
           pair each work-vector slot with the corresponding full-vector entry */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      /* opA indexes fixed-size per-operation caches below, so guard against enum changes */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        /* first call with this opA: create dense-vector descriptors, size and allocate the SpMV buffer */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy csrmv path */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop count: 2 flops per stored nonzero; without an added vector, one add per nonzero row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3738 
/* zz = A^T xx + yy: transpose multiply-add (no conjugation) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3745 
/* Finish assembly on the host via the base SeqAIJ implementation; the device copy is
   refreshed lazily later (see the MatSeqAIJCUSPARSECopyToGPU() call in the mult kernel) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3752 
3753 /*@
3754   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3755   (the default parallel PETSc format).
3756 
3757   Collective
3758 
3759   Input Parameters:
3760 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3761 . m    - number of rows
3762 . n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3764 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3765 
3766   Output Parameter:
3767 . A - the matrix
3768 
3769   Level: intermediate
3770 
3771   Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3773   calculations. For good matrix assembly performance the user should preallocate the matrix
3774   storage by setting the parameter `nz` (or the array `nnz`).
3775 
3776   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
3778   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3779 
3780   The AIJ format, also called
3781   compressed row storage, is fully compatible with standard Fortran
3782   storage.  That is, the stored row and column indices can begin at
3783   either one (as in Fortran) or zero.
3784 
3785   Specify the preallocated storage with either nz or nnz (not both).
3786   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3787   allocation.
3788 
3789 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3790 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* cast drops const for the SeqAIJ preallocation interface — presumably nnz is only read; TODO confirm */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3800 
/* Release all CUSPARSE-specific state, clear the composed methods this type installed,
   then hand off to the base SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the methods composed on the matrix by this type; each is reset to NULL below */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* A->spptr holds different GPU structs for plain vs. factored matrices */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3822 
3823 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3824 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* duplicate with the base SeqAIJ implementation, then convert the copy in place to the CUSPARSE type */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3832 
/* Y = a*X + Y for two MATSEQAIJCUSPARSE matrices, choosing between three paths:
   a cheap cuBLAS axpy when the nonzero patterns match, a cuSPARSE csrgeam when
   X's pattern is a subset of Y's, and a host fallback otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices disagree on where they run (one is bound to the CPU): fall back to the host path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nz count: compare row offsets and column indices on the device; if both match, the patterns are identical */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* general sparse add: Y = a*X + b*Y with b = 1, writing the result back into Y's existing pattern */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b live on the host, so switch the pointer mode for the duration of the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit user-provided work buffer for csrgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the library's default pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up one-to-one, so a dense axpy on them suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: Y's structure will change, so do the work on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3918 
/* Scale all stored values of Y by a, in place on the device, using cublasXscal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* nz may not fit a PetscBLASInt; error out if so */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal is stale after scaling */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3938 
/* Zero all stored values of A on both the device (CSR and cached transpose, when
   present) and the host, and set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij        = (Mat_SeqAIJ *)A->data;
  PetscBool   gpu_zeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;
      if (csr->values) {
        gpu_zeroed = PETSC_TRUE; /* device copy is now valid (all zeros) */
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* zero the host-side values as well */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpu_zeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3965 
/* Switch the matrix between CPU (flg = PETSC_TRUE) and GPU (flg = PETSC_FALSE)
   execution by swapping the operation tables and composed functions. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; their solve path checks it elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* binding to CPU: make sure host data is current, then restore the base SeqAIJ ops */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ sub-ops (array access hooks) installed by the GPU branch below */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* binding to GPU: install the CUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* value-array accessors go through the device-aware versions so data is synced lazily */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations are a host-only feature: enable them only when bound to CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4029 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE: allocate the GPU-side bookkeeping
   structure, switch the vector type to CUDA, and install the CUSPARSE ops. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* for MAT_INPLACE_MATRIX neither branch runs and *newmat is A itself */
  B = *newmat;

  /* vectors this matrix creates (e.g. in MatCreateVecs) default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* unfactored matrix: full CUSPARSE bookkeeping structure with algorithm defaults */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: triangular-factor structure instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining GPU ops and composed functions */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4089 
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* build a plain SeqAIJ matrix, then convert it in place to the CUSPARSE type */
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4097 
4098 /*MC
4099    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4100 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4104 
4105    Options Database Keys:
4106 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4107 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4108                                       Other options include ell (ellpack) or hyb (hybrid).
4109 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4110 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4111 
4112   Level: beginner
4113 
4114 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4115 M*/
4116 
/* Register the cuSPARSE solver package for every factorization it supports
   (LU, Cholesky, ILU, ICC), all dispatching to the same factory routine. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(factors) / sizeof(factors[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4126 
/* Free the GPU-side data of an unfactored MATSEQAIJCUSPARSE matrix: the CSR
   structures for the matrix and its cached transpose, the auxiliary device
   vectors, and the cuSPARSE handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *spptr = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (spptr) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->mat, spptr->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose, spptr->format));
    /* thrust device vectors; delete on NULL is a no-op */
    delete spptr->workVector;
    delete spptr->rowoffsets_gpu;
    delete spptr->csr2csc_i;
    delete spptr->coords;
    if (spptr->handle) PetscCallCUSPARSE(cusparseDestroy(spptr->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4144 
/* Free a CsrMatrix (its three thrust device arrays and the struct itself)
   and NULL the caller's pointer. Safe to call when *mat is already NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4157 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor structure (legacy csrsv path, CUDA < 11.4):
   descriptor, solve info, CSR data, and the device/host work buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor)); /* PetscFree also NULLs *trifactor */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4176 
/* Free a mult structure: the stored matrix (CSR, or HYB on pre-11.0 CUDA),
   its cuSPARSE descriptors, the device-resident scalar constants, and any
   cached SpMV/SpMM descriptors and buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants (1, 0) kept in device memory for device pointer mode */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached SpMV setup per operation variant (see cuSpMV[3] usage elsewhere in this file) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4223 
/* Release all data held by a triangular-factors structure without destroying
   the structure (or its cuSPARSE handle) itself, so it can be refilled by a
   new factorization. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy csrsv path: four factor structures (L/U and their transposes) plus a work vector */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* modern SpSV path: raw device arrays, analysis buffers, and descriptors.
       cudaFree/destroy calls tolerate NULL for members that were never allocated */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4277 
/* Fully destroy a triangular-factors structure: reset its contents, destroy
   the cuSPARSE handle, and free the structure itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); /* Reset frees members but keeps fs and its handle */
    PetscCallCUSPARSE(cusparseDestroy(fs->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4288 
/* Lexicographic less-than on (row, col) index tuples, usable from host or device;
   intended as a comparator for thrust sorting of COO entries. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;            /* rows decide unless equal */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* tie-break on column */
  }
};
4297 
/* Mark the cached device transpose of A as out of date; when destroy is true,
   also free the transpose structure and the csr2csc permutation cache. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4313 
/* Container destructor for the device-side COO struct: frees its two device
   arrays (perm, jmap) and then the host-allocated struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo_d = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo_d->perm));
  PetscCallCUDA(cudaFree(coo_d->jmap));
  PetscCall(PetscFree(coo_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4324 
/* Set up COO assembly: run the host SeqAIJ preallocation (copying i/j to the
   host first if the caller passed device pointers), then mirror the resulting
   COO bookkeeping (jmap, perm) to the device for MatSetValuesCOO. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host-side preallocation needs host copies of the index arrays */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4366 
/* Kernel: fold user-supplied COO values kv[] into the CSR value array a[].
   jmap[i]..jmap[i+1] delimits, and perm[] indexes, the COO entries that land in
   CSR slot i. INSERT_VALUES overwrites a[i]; otherwise the sum is added to it.
   Launched 1-D; a grid-stride loop covers all nnz slots for any grid size. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nnz; idx += stride) {
    PetscScalar acc = 0.0;
    for (PetscCount k = jmap[idx]; k < jmap[idx + 1]; k++) acc += kv[perm[k]];
    a[idx] = (imode == INSERT_VALUES ? 0.0 : a[idx]) + acc;
  }
}
4377 
/* Insert/add COO values on the device. v[] may live on host or device; host
   data is staged through a temporary device buffer. Requires a prior call to
   MatSetPreallocationCOO(), which attaches the device COO struct queried here. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  /* fail with a clear message instead of dereferencing NULL when preallocation was skipped */
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix has no device COO structure; call MatSetPreallocationCOO() first");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites every entry, so write-only access avoids a host-to-device sync */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* one thread per CSR nonzero, 256 threads per block; the kernel grid-strides over any remainder */
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* surface launch errors without clearing the error state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4417 
4418 /*@C
4419   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4420 
4421   Not Collective
4422 
4423   Input Parameters:
4424 + A          - the matrix
4425 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4426 
4427   Output Parameters:
4428 + i - the CSR row pointers
4429 - j - the CSR column indices
4430 
4431   Level: developer
4432 
4433   Note:
4434   When compressed is true, the CSR structure does not contain empty rows
4435 
4436 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4437 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both outputs required; nothing to do otherwise */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) { /* always true here (checked above); kept for symmetry with the j branch */
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* device CSR is stored compressed; lazily build and cache the full row-offset array from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4465 
4466 /*@C
4467   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4468 
4469   Not Collective
4470 
4471   Input Parameters:
4472 + A          - the matrix
4473 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4474 . i          - the CSR row pointers
4475 - j          - the CSR column indices
4476 
4477   Level: developer
4478 
4479 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4480 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: nothing to undo, we only invalidate the caller's pointers */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4491 
4492 /*@C
4493   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4494 
4495   Not Collective
4496 
4497   Input Parameter:
4498 . A - a `MATSEQAIJCUSPARSE` matrix
4499 
4500   Output Parameter:
4501 . a - pointer to the device data
4502 
4503   Level: developer
4504 
4505   Note:
4506   May trigger host-device copies if up-to-date matrix data is on host
4507 
4508 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4509 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  CsrMatrix          *csrmat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read access: bring the device copy up to date first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* hand out the raw device pointer to the CSR values; offload state is untouched for read-only use */
  *a = csrmat->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4527 
4528 /*@C
4529   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4530 
4531   Not Collective
4532 
4533   Input Parameters:
4534 + A - a `MATSEQAIJCUSPARSE` matrix
4535 - a - pointer to the device data
4536 
4537   Level: developer
4538 
4539 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4540 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no matrix state to update, just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4550 
4551 /*@C
4552   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4553 
4554   Not Collective
4555 
4556   Input Parameter:
4557 . A - a `MATSEQAIJCUSPARSE` matrix
4558 
4559   Output Parameter:
4560 . a - pointer to the device data
4561 
4562   Level: developer
4563 
4564   Note:
4565   May trigger host-device copies if up-to-date matrix data is on host
4566 
4567 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4568 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  CsrMatrix          *csrmat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: the device copy must be current before it can be modified in place */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* caller may change values: GPU holds the authoritative data and any cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4588 /*@C
4589   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4590 
4591   Not Collective
4592 
4593   Input Parameters:
4594 + A - a `MATSEQAIJCUSPARSE` matrix
4595 - a - pointer to the device data
4596 
4597   Level: developer
4598 
4599 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4600 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: bump the object state and drop cached diagonal information */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4612 
4613 /*@C
4614   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4615 
4616   Not Collective
4617 
4618   Input Parameter:
4619 . A - a `MATSEQAIJCUSPARSE` matrix
4620 
4621   Output Parameter:
4622 . a - pointer to the device data
4623 
4624   Level: developer
4625 
4626   Note:
4627   Does not trigger host-device copies and flags data validity on the GPU
4628 
4629 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4630 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  CsrMatrix          *csrmat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is triggered,
     since the caller is expected to overwrite the values */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* after this call the GPU holds the authoritative data; cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4649 
4650 /*@C
4651   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4652 
4653   Not Collective
4654 
4655   Input Parameters:
4656 + A - a `MATSEQAIJCUSPARSE` matrix
4657 - a - pointer to the device data
4658 
4659   Level: developer
4660 
4661 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4662 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written: bump the object state and drop cached diagonal information */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4674 
/* strict-weak ordering on (row, col) — the first two tuple entries; the value and
   origin-flag entries are ignored, as required for the stable merge of A and B */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    return r1 < r2 || (r1 == r2 && thrust::get<1>(t1) < thrust::get<1>(t2));
  }
};
4683 
/* unary functor adding a fixed offset to an int (used to shift column indices / row offsets) */
struct Shift {
  int delta; /* amount added to every input value */

  Shift(int shift) : delta(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + delta; }
};
4690 
/* MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows:
   C = [A B], i.e. the [A';B']' operation in MATLAB notation.

   Both inputs must be MATSEQAIJCUSPARSE in CSR format with the same row count; MAT_INPLACE_MATRIX is
   not supported.  With MAT_INITIAL_MATRIX the merged CSR structure of C (and, when both inputs keep an
   explicit transpose, of C^T) is built entirely on the device; the permutation mapping entries of A
   and B into C is saved in Ccusp->coords so that MAT_REUSE_MATRIX can refresh only the numerical values. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C from scratch: allocate the cusparse structures, then merge the two CSR patterns on the device */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C is stored with full (uncompressed) row offsets */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by cusparse SpMV calls */
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* convert A and B to COO row indices so entries can be merged as (row, col, val, flag) tuples */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* per-entry origin flags: 1 for entries coming from A, 0 for entries coming from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* B's column indices must be shifted by A->cmap->n to land in C's column space */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the (row, col, val, flag) streams of A and B ordered by (row, col); wPerm records each
         merged entry's origin so the A/B -> C position maps (coords) can be recovered below */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* split the merged positions by origin: entries from A (flag 1) go to coords[0..Annz),
         entries from B (flag 0) go to coords[Annz..nz) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back to CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] stacked rowwise: concatenate the transposed CSR arrays, shifting B^T's
           row offsets by a->nz so they continue where A^T's entries end */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one so B^T's first offset overwrites A^T's trailing (duplicate) offset */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the merged CSR pattern on the host so the Mat_SeqAIJ side of C is consistent */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* fill per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: only refresh the values of C using the coords permutation stored at creation */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0..Annz) are C-positions of A's entries, coords[Annz..nz) those of B's entries */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through the saved permutation */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      /* then scatter B's values */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C^T values directly by concatenating A^T's and B^T's value arrays */
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4971 
/* MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies n entries of A's device value array into v.

   When idx is given, v[k] = values[idx[k]] (a device-side gather); when idx is NULL the first
   n values are copied contiguously.  v may live in either host or device memory; the destination
   kind is auto-detected with isCudaMem() and the copy direction chosen accordingly. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index list, then gather av[idx[k]] into the destination on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination means the payload moved device -> host: log GpuToCpu (was wrongly CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5007 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
5008