xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision f9334340f3fab3703ba64cae52af023ff47a74d4)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
25 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
26 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30 
31   typedef enum {
32       CUSPARSE_MV_ALG_DEFAULT = 0,
33       CUSPARSE_COOMV_ALG      = 1,
34       CUSPARSE_CSRMV_ALG1     = 2,
35       CUSPARSE_CSRMV_ALG2     = 3
36   } cusparseSpMVAlg_t;
37 
38   typedef enum {
39       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45       CUSPARSE_SPMM_COO_ALG1    = 1,
46       CUSPARSE_SPMM_COO_ALG2    = 2,
47       CUSPARSE_SPMM_COO_ALG3    = 3,
48       CUSPARSE_SPMM_COO_ALG4    = 5,
49       CUSPARSE_SPMM_CSR_ALG1    = 4,
50       CUSPARSE_SPMM_CSR_ALG2    = 6,
51   } cusparseSpMMAlg_t;
52 
53   typedef enum {
54       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
55       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
56   } cusparseCsr2CscAlg_t;
57   */
58 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61 #endif
62 
63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73 #endif
74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
84 
85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
88 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
89 
90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
92 
93 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96 
97 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
98 {
99   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
100 
101   PetscFunctionBegin;
102   switch (op) {
103   case MAT_CUSPARSE_MULT:
104     cusparsestruct->format = format;
105     break;
106   case MAT_CUSPARSE_ALL:
107     cusparsestruct->format = format;
108     break;
109   default:
110     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
111   }
112   PetscFunctionReturn(PETSC_SUCCESS);
113 }
114 
115 /*@
116   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
117   operation. Only the `MatMult()` operation can use different GPU storage formats
118 
119   Not Collective
120 
121   Input Parameters:
122 + A      - Matrix of type `MATSEQAIJCUSPARSE`
123 . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
124         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
125 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
126 
127   Level: intermediate
128 
129 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
130 @*/
131 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
132 {
133   PetscFunctionBegin;
134   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
135   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
140 {
141   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
142 
143   PetscFunctionBegin;
144   cusparsestruct->use_cpu_solve = use_cpu;
145   PetscFunctionReturn(PETSC_SUCCESS);
146 }
147 
148 /*@
149   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
150 
151   Input Parameters:
152 + A       - Matrix of type `MATSEQAIJCUSPARSE`
153 - use_cpu - set flag for using the built-in CPU `MatSolve()`
154 
155   Level: intermediate
156 
157   Note:
158   The cuSparse LU solver currently computes the factors with the built-in CPU method
159   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
160   This method to specify if the solve is done on the CPU or GPU (GPU is the default).
161 
162 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
163 @*/
164 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
165 {
166   PetscFunctionBegin;
167   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
168   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
173 {
174   PetscFunctionBegin;
175   switch (op) {
176   case MAT_FORM_EXPLICIT_TRANSPOSE:
177     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
178     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
179     A->form_explicit_transpose = flg;
180     break;
181   default:
182     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
183     break;
184   }
185   PetscFunctionReturn(PETSC_SUCCESS);
186 }
187 
188 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
189 {
190   MatCUSPARSEStorageFormat format;
191   PetscBool                flg;
192   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
193 
194   PetscFunctionBegin;
195   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
196   if (A->factortype == MAT_FACTOR_NONE) {
197     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
198     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
199 
200     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
201     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
202     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
203     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
205     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
206     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
207   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
208     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
209   #else
210     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
211   #endif
212     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
213     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
214 
215     PetscCall(
216       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
217     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
218 #endif
219   }
220   PetscOptionsHeadEnd();
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
225 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
226 {
227   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
228   PetscInt                      m  = A->rmap->n;
229   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
230   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
231   const MatScalar              *Aa = a->a;
232   PetscInt                     *Mi, *Mj, Mnz;
233   PetscScalar                  *Ma;
234 
235   PetscFunctionBegin;
236   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
237     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
238       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
239       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
240       PetscCall(PetscMalloc1(m + 1, &Mi));
241       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
242       PetscCall(PetscMalloc1(Mnz, &Ma));
243       Mi[0] = 0;
244       for (PetscInt i = 0; i < m; i++) {
245         PetscInt llen = Ai[i + 1] - Ai[i];
246         PetscInt ulen = Adiag[i] - Adiag[i + 1];
247         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
248         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
249         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
250         Mi[i + 1] = Mi[i] + llen + ulen;
251       }
252       // Copy M (L,U) from host to device
253       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
254       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
255       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
256       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
257       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
258 
259       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
260       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
261       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
262       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
263       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
264       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
265       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
266       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
267 
268       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
269       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
270       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
271 
272       fillMode = CUSPARSE_FILL_MODE_UPPER;
273       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
274       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
275       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
276       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
277 
278       // Allocate work vectors in SpSv
279       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
280       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
281 
282       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
283       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
284 
285       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
286       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
287       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
288       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
289       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
290       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
291       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
292 
293       // Record for reuse
294       fs->csrRowPtr_h = Mi;
295       fs->csrVal_h    = Ma;
296       PetscCall(PetscFree(Mj));
297     }
298     // Copy the value
299     Mi  = fs->csrRowPtr_h;
300     Ma  = fs->csrVal_h;
301     Mnz = Mi[m];
302     for (PetscInt i = 0; i < m; i++) {
303       PetscInt llen = Ai[i + 1] - Ai[i];
304       PetscInt ulen = Adiag[i] - Adiag[i + 1];
305       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
306       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
307       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
308     }
309     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
310 
311     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
312     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
313 
314     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
315 
316     // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
317     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
318   }
319   PetscFunctionReturn(PETSC_SUCCESS);
320 }
321 #else
322 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
323 {
324   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
325   PetscInt                           n                  = A->rmap->n;
326   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
327   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
328   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
329   const MatScalar                   *aa = a->a, *v;
330   PetscInt                          *AiLo, *AjLo;
331   PetscInt                           i, nz, nzLower, offset, rowOffset;
332 
333   PetscFunctionBegin;
334   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
335   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
336     try {
337       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
338       nzLower = n + ai[n] - ai[1];
339       if (!loTriFactor) {
340         PetscScalar *AALo;
341 
342         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
343 
344         /* Allocate Space for the lower triangular matrix */
345         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
346         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
347 
348         /* Fill the lower triangular matrix */
349         AiLo[0]   = (PetscInt)0;
350         AiLo[n]   = nzLower;
351         AjLo[0]   = (PetscInt)0;
352         AALo[0]   = (MatScalar)1.0;
353         v         = aa;
354         vi        = aj;
355         offset    = 1;
356         rowOffset = 1;
357         for (i = 1; i < n; i++) {
358           nz = ai[i + 1] - ai[i];
359           /* additional 1 for the term on the diagonal */
360           AiLo[i] = rowOffset;
361           rowOffset += nz + 1;
362 
363           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
364           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
365 
366           offset += nz;
367           AjLo[offset] = (PetscInt)i;
368           AALo[offset] = (MatScalar)1.0;
369           offset += 1;
370 
371           v += nz;
372           vi += nz;
373         }
374 
375         /* allocate space for the triangular factor information */
376         PetscCall(PetscNew(&loTriFactor));
377         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
378         /* Create the matrix description */
379         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
380         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
381   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
382         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
383   #else
384         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
385   #endif
386         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
387         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
388 
389         /* set the operation */
390         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
391 
392         /* set the matrix */
393         loTriFactor->csrMat              = new CsrMatrix;
394         loTriFactor->csrMat->num_rows    = n;
395         loTriFactor->csrMat->num_cols    = n;
396         loTriFactor->csrMat->num_entries = nzLower;
397 
398         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
399         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
400 
401         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
402         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
403 
404         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
405         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
406 
407         /* Create the solve analysis information */
408         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
409         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
410   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
411         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
412                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
413         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
414   #endif
415 
416         /* perform the solve analysis */
417         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
418                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
419         PetscCallCUDA(WaitForCUDA());
420         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
421 
422         /* assign the pointer */
423         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
424         loTriFactor->AA_h                                          = AALo;
425         PetscCallCUDA(cudaFreeHost(AiLo));
426         PetscCallCUDA(cudaFreeHost(AjLo));
427         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
428       } else { /* update values only */
429         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
430         /* Fill the lower triangular matrix */
431         loTriFactor->AA_h[0] = 1.0;
432         v                    = aa;
433         vi                   = aj;
434         offset               = 1;
435         for (i = 1; i < n; i++) {
436           nz = ai[i + 1] - ai[i];
437           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
438           offset += nz;
439           loTriFactor->AA_h[offset] = 1.0;
440           offset += 1;
441           v += nz;
442         }
443         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
444         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
445       }
446     } catch (char *ex) {
447       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
448     }
449   }
450   PetscFunctionReturn(PETSC_SUCCESS);
451 }
452 
453 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
454 {
455   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
456   PetscInt                           n                  = A->rmap->n;
457   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
458   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
459   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
460   const MatScalar                   *aa = a->a, *v;
461   PetscInt                          *AiUp, *AjUp;
462   PetscInt                           i, nz, nzUpper, offset;
463 
464   PetscFunctionBegin;
465   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
466   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
467     try {
468       /* next, figure out the number of nonzeros in the upper triangular matrix. */
469       nzUpper = adiag[0] - adiag[n];
470       if (!upTriFactor) {
471         PetscScalar *AAUp;
472 
473         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
474 
475         /* Allocate Space for the upper triangular matrix */
476         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
477         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
478 
479         /* Fill the upper triangular matrix */
480         AiUp[0] = (PetscInt)0;
481         AiUp[n] = nzUpper;
482         offset  = nzUpper;
483         for (i = n - 1; i >= 0; i--) {
484           v  = aa + adiag[i + 1] + 1;
485           vi = aj + adiag[i + 1] + 1;
486 
487           /* number of elements NOT on the diagonal */
488           nz = adiag[i] - adiag[i + 1] - 1;
489 
490           /* decrement the offset */
491           offset -= (nz + 1);
492 
493           /* first, set the diagonal elements */
494           AjUp[offset] = (PetscInt)i;
495           AAUp[offset] = (MatScalar)1. / v[nz];
496           AiUp[i]      = AiUp[i + 1] - (nz + 1);
497 
498           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
499           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
500         }
501 
502         /* allocate space for the triangular factor information */
503         PetscCall(PetscNew(&upTriFactor));
504         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
505 
506         /* Create the matrix description */
507         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
508         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
509   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
510         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
511   #else
512         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
513   #endif
514         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
515         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
516 
517         /* set the operation */
518         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
519 
520         /* set the matrix */
521         upTriFactor->csrMat              = new CsrMatrix;
522         upTriFactor->csrMat->num_rows    = n;
523         upTriFactor->csrMat->num_cols    = n;
524         upTriFactor->csrMat->num_entries = nzUpper;
525 
526         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
527         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
528 
529         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
530         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
531 
532         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
533         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
534 
535         /* Create the solve analysis information */
536         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
537         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
538   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
539         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
540                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
541         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
542   #endif
543 
544         /* perform the solve analysis */
545         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
546                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
547 
548         PetscCallCUDA(WaitForCUDA());
549         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
550 
551         /* assign the pointer */
552         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
553         upTriFactor->AA_h                                          = AAUp;
554         PetscCallCUDA(cudaFreeHost(AiUp));
555         PetscCallCUDA(cudaFreeHost(AjUp));
556         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
557       } else {
558         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
559         /* Fill the upper triangular matrix */
560         offset = nzUpper;
561         for (i = n - 1; i >= 0; i--) {
562           v = aa + adiag[i + 1] + 1;
563 
564           /* number of elements NOT on the diagonal */
565           nz = adiag[i] - adiag[i + 1] - 1;
566 
567           /* decrement the offset */
568           offset -= (nz + 1);
569 
570           /* first, set the diagonal elements */
571           upTriFactor->AA_h[offset] = 1. / v[nz];
572           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
573         }
574         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
575         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
576       }
577     } catch (char *ex) {
578       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
579     }
580   }
581   PetscFunctionReturn(PETSC_SUCCESS);
582 }
583 #endif
584 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - push the host ILU/LU factors of A to the GPU and cache the
  row/column orderings on the device.

  Input Parameter:
. A - the factored matrix; A->spptr must already hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  With CUDA >= 11.4 the factors are built by MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(); otherwise the lower and
  upper triangular matrices are built separately and a work vector of length n is created on first use.

  A permutation array is uploaded only when the corresponding ordering is not the identity and no array has
  been cached yet. On success A->offloadmask is set to PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt                nrows   = A->rmap->n;
  PetscBool                     isIdentity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif

  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now hold the same factors */

  /* row ordering (a->row), used on the lower triangular side */
  PetscCall(ISIdentity(aij->row, &isIdentity));
  if (!factors->rpermIndices && !isIdentity) {
    const PetscInt *rowPerm;

    PetscCall(ISGetIndices(aij->row, &rowPerm));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(rowPerm, rowPerm + nrows);
    PetscCall(ISRestoreIndices(aij->row, &rowPerm));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }

  /* column ordering (a->icol), used on the upper triangular side */
  PetscCall(ISIdentity(aij->icol, &isIdentity));
  if (!factors->cpermIndices && !isIdentity) {
    const PetscInt *colPerm;

    PetscCall(ISGetIndices(aij->icol, &colPerm));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(colPerm, colPerm + nrows);
    PetscCall(ISRestoreIndices(aij->icol, &colPerm));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
631 
632 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - mirror the host Cholesky/ICC factor of A on the GPU as a
  regular CSR upper triangular matrix U plus a diagonal vector, and run the cusparse SpSV analyses on it.

  Input Parameter:
. A - the factored matrix

  Notes:
  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_CPU, i.e. the newest factors are on the host.

  The first call (detected by fs->csrRowPtr being NULL) allocates the device arrays, the cusparse descriptors
  and the SpSV buffers, and keeps the host staging arrays in fs->csrVal_h and fs->diag_h. Every call then
  refreshes the numerical values and repeats the SpSV analysis for both U and U^T.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *aij      = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJCUSPARSETriFactors *fs       = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt                nrows    = A->rmap->n;
  const PetscInt               *rowStart = aij->i, *hostCols = aij->j, *diagPos = aij->diag;
  const MatScalar              *hostVals = aij->a;
  PetscInt                      nnzU;
  PetscScalar                  *valsU, *invDiag;

  PetscFunctionBegin;
  /* Only act when the latest factors live on the CPU */
  if (A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(PETSC_SUCCESS);

  nnzU = rowStart[nrows]; // number of nonzeros of U, unit diagonal included

  if (!fs->csrRowPtr) { // Is this the first setup? csrRowPtr is tested since it is not null even when m=0
    PetscInt *colsU;    // temporary host copy of the column indices of U

    // Re-arrange the (skewed) factored matrix into a regular csr matrix on host.
    // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
    PetscCall(PetscMalloc1(nnzU, &valsU));
    PetscCall(PetscMalloc1(nnzU, &colsU));
    PetscCall(PetscMalloc1(nrows, &invDiag));
    for (PetscInt row = 0; row < nrows; row++) {
      const PetscInt first = rowStart[row];
      const PetscInt noff  = rowStart[row + 1] - first - 1; // entries to the right of the diagonal

      colsU[first] = row; // the diagonal leads each row
      PetscCall(PetscArraycpy(colsU + first + 1, hostCols + first, noff));
    }

    // Device storage for U and the diagonal; the sparsity pattern is copied once, here
    PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (nrows + 1)));
    PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * nnzU));
    PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * nnzU));
    PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * nrows));
    PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, rowStart, sizeof(*rowStart) * (nrows + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->csrColIdx, colsU, sizeof(*colsU) * nnzU, cudaMemcpyHostToDevice));

    // Descriptor for U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    // With CUSPARSE_DIAG_TYPE_UNIT a routine assumes every diagonal entry equals one and neither reads nor
    // modifies the stored diagonal, regardless of what is actually in memory.
    cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
    cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
    const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

    PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, nrows, nrows, nnzU, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
    PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
    PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

    // Work vectors used by SpSV and their dense-vector descriptors
    PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * nrows));
    PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * nrows));

    PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, nrows, fs->X, cusparse_scalartype));
    PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, nrows, fs->Y, cusparse_scalartype));

    // SpSV descriptor and buffer for the U solve
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

    // The Ut solve shares the matrix descriptor (spMatDescr_U) but needs its own SpSV descriptor and buffer
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

    // Keep the host staging arrays for later value refreshes; the index copy is no longer needed
    fs->csrVal_h = valsU;
    fs->diag_h   = invDiag;
    PetscCall(PetscFree(colsU));
  }

  // Refresh the numerical values through the host staging arrays
  valsU   = fs->csrVal_h;
  invDiag = fs->diag_h;
  for (PetscInt row = 0; row < nrows; row++) {
    const PetscInt first = rowStart[row];
    const PetscInt noff  = rowStart[row + 1] - first - 1;

    invDiag[row] = hostVals[diagPos[row]]; // actually the inverse of the diagonal
    valsU[first] = (MatScalar)1.0;         // unit diagonal; cosmetic since cusparse does not read it given CUSPARSE_DIAG_TYPE_UNIT
    for (PetscInt k = 0; k < noff; k++) valsU[first + 1 + k] = -hostVals[first + k];
  }
  PetscCallCUDA(cudaMemcpy(fs->csrVal, valsU, sizeof(*valsU) * nnzU, cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(fs->diag, invDiag, sizeof(*invDiag) * nrows, cudaMemcpyHostToDevice));

  // cusparseSpSV_analysis() is numeric, so it has to be redone with the up-to-date matrix values
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  PetscFunctionReturn(PETSC_SUCCESS);
}
717 
/*
  MatSolve_SeqAIJCUSPARSE_Cholesky - solve Ut D U x = b on the GPU with the factors stored in A->spptr

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The steps are: optional gather of b through rpermIndices into fs->X, transposed triangular solve into fs->Y,
  element-wise scaling of fs->Y by fs->diag, triangular solve with U, and optional gather of the result through
  cpermIndices into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs     = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij    = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                m      = A->rmap->n;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;
  void                         *solValues;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));

  thrust::device_ptr<PetscScalar>       xDev = thrust::device_pointer_cast(xarray);
  thrust::device_ptr<const PetscScalar> bDev = thrust::device_pointer_cast(barray);
  auto                                  XDev = thrust::device_pointer_cast(fs->X);
  auto                                  YDev = thrust::device_pointer_cast(fs->Y);

  // Right-hand side of the first solve: b gathered through the row permutation into fs->X, or b itself
  if (!fs->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  } else {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bDev, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bDev, fs->rpermIndices->end()), XDev));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. This is just Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is an element-wise vector product, which cublas does not offer, hence thrust::transform.
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), YDev, thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), YDev, thrust::multiplies<PetscScalar>()));

  // Solve U X = Y. The result goes straight into x unless a column permutation still has to be applied,
  // in which case the intermediate buffer fs->X receives it.
  solValues = fs->cpermIndices ? (void *)fs->X : (void *)xarray;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, solValues));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_U));

  // Gather fs->X through the column permutation into x when needed
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(XDev, fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xDev));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
773 #else
/*
  MatSeqAIJCUSPARSEBuildICCTriFactor_Private - create one csrsv triangular factor on the GPU from host CSR
  arrays and perform its solve analysis.

  Input Parameters:
+ A        - the factored matrix; sizes are taken from A->rmap->n, A->cmap->n and a->nz
. diagType - cusparse diagonal type of the factor
. solveOp  - cusparse operation applied when solving with the factor
. Ai       - host row offsets (n + 1 entries)
. Aj       - host column indices (a->nz entries)
- AA       - host values (a->nz entries)

  Output Parameter:
. triFactor - the newly allocated factor

  Notes:
  The descriptor always uses CUSPARSE_FILL_MODE_UPPER: the caller stores both ICC factors with the same upper
  triangular pattern and selects the lower one through a transposed solve operation.

  C++ exceptions thrown by the thrust containers are not caught here; they propagate to the caller.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriFactor_Private(Mat A, cusparseDiagType_t diagType, cusparseOperation_t solveOp, const PetscInt *Ai, const PetscInt *Aj, const PetscScalar *AA, Mat_SeqAIJCUSPARSETriFactorStruct **triFactor)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *factor;

  PetscFunctionBegin;
  /* allocate space for the triangular factor information */
  PetscCall(PetscNew(&factor));
  factor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&factor->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(factor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseSetMatType(factor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
  PetscCallCUSPARSE(cusparseSetMatType(factor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
  PetscCallCUSPARSE(cusparseSetMatFillMode(factor->descr, CUSPARSE_FILL_MODE_UPPER));
  PetscCallCUSPARSE(cusparseSetMatDiagType(factor->descr, diagType));

  /* set the operation */
  factor->solveOp = solveOp;

  /* set the matrix */
  factor->csrMat              = new CsrMatrix;
  factor->csrMat->num_rows    = A->rmap->n;
  factor->csrMat->num_cols    = A->cmap->n;
  factor->csrMat->num_entries = a->nz;

  factor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
  factor->csrMat->row_offsets->assign(Ai, Ai + A->rmap->n + 1);

  factor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
  factor->csrMat->column_indices->assign(Aj, Aj + a->nz);

  factor->csrMat->values = new THRUSTARRAY(a->nz);
  factor->csrMat->values->assign(AA, AA + a->nz);

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&factor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, factor->solveOp, factor->csrMat->num_rows, factor->csrMat->num_entries, factor->descr, factor->csrMat->values->data().get(), factor->csrMat->row_offsets->data().get(),
                                            factor->csrMat->column_indices->data().get(), factor->solveInfo, &factor->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&factor->solveBuffer, factor->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, factor->solveOp, factor->csrMat->num_rows, factor->csrMat->num_entries, factor->descr, factor->csrMat->values->data().get(), factor->csrMat->row_offsets->data().get(),
                                            factor->csrMat->column_indices->data().get(), factor->solveInfo, factor->solvePolicy, factor->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  *triFactor = factor;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - build (first call) or refresh (later calls) the upper and lower
  triangular ICC factors of A on the GPU for the csrsv-based solves.

  Input Parameter:
. A - the factored matrix

  Notes:
  Nothing is done when the local size is zero, or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED nor
  PETSC_OFFLOAD_CPU.

  Each host row holds its off-diagonal entries followed by the diagonal entry. On the GPU each row starts with
  the reciprocal of that diagonal entry, followed by the negated off-diagonal entries (upper factor) or the
  negated off-diagonal entries divided by the diagonal entry (lower factor). Both factors share one sparsity
  pattern; the lower factor is applied with CUSPARSE_OPERATION_TRANSPOSE.

  When exactly one of the two factors is missing the function errors out before allocating any host buffer.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j;
  const MatScalar                   *aa = b->a, *v;
  PetscBool                          firstTime;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    firstTime = (!upTriFactor && !loTriFactor) ? PETSC_TRUE : PETSC_FALSE;
    if (!firstTime) {
      /* a refresh needs both factors; verify before any pinned host memory is allocated so nothing leaks on error */
      PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
    }
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));

      /* Fill the values of both factors; this part is identical for the first and for later calls */
      offset = 0;
      for (i = 0; i < n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AAUp[offset] = (MatScalar)1.0 / v[nz];
        AALo[offset] = (MatScalar)1.0 / v[nz];

        offset += 1;
        if (nz > 0) {
          PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
          for (j = offset; j < offset + nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j] / v[nz];
          }
          offset += nz;
        }
      }

      if (firstTime) {
        /* Allocate space for the sparsity pattern shared by both factors */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the pattern: the diagonal comes first in every row, followed by the host column indices */
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          AiUp[i]      = offset;
          AjUp[offset] = (PetscInt)i;

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], aj + ai[i], nz));
            offset += nz;
          }
        }

        /* upper factor: unit diagonal, non-transposed solve */
        PetscCall(MatSeqAIJCUSPARSEBuildICCTriFactor_Private(A, CUSPARSE_DIAG_TYPE_UNIT, CUSPARSE_OPERATION_NON_TRANSPOSE, AiUp, AjUp, AAUp, &upTriFactor));
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* lower factor: same pattern, non-unit diagonal, transposed solve */
        PetscCall(MatSeqAIJCUSPARSEBuildICCTriFactor_Private(A, CUSPARSE_DIAG_TYPE_NON_UNIT, CUSPARSE_OPERATION_TRANSPOSE, AiUp, AjUp, AALo, &loTriFactor));
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* the pattern is unchanged: only the values are uploaded again */
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
970 #endif
971 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - push the host ICC/Cholesky factor of A to the GPU and upload the
  ordering and its inverse when the ordering is not the identity.

  Input Parameter:
. A - the factored matrix; A->spptr must already hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  With CUDA >= 11.4 the factor is built by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(); otherwise the two
  csrsv triangular factors are built and a work vector of length n is created on first use.

  rpermIndices receives the ordering a->row and cpermIndices its inverse. The device arrays are allocated only
  when they do not exist yet, so calling this routine for every numeric factorization does not leak the arrays
  of the previous call; their contents are still refreshed on every call.

  On success A->offloadmask is set to PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* off-diagonal entries are counted twice (once per triangle), the diagonal once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    /* reuse the device arrays of an earlier call instead of overwriting (and leaking) the pointers */
    if (!cusparseTriFactors->rpermIndices) cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    if (!cusparseTriFactors->cpermIndices) cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1013 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SeqAIJCUSPARSE matrices

  Input Parameters:
+ B    - the factor matrix to fill
. A    - the matrix to factor
- info - factorization options, forwarded to the host routine

  Notes:
  The factorization itself is done on the host by MatCholeskyFactorNumeric_SeqAIJ() after A has been copied
  back from the GPU. The solve operations of B are then set to the CUSPARSE implementations and the factor is
  copied to the GPU by MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU().
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* factor on the host with up-to-date values */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* one routine serves both the solve and the transposed solve */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* the natural-ordering variants are used when the ordering of B is the identity */
    PetscBool naturalOrdering;

    PetscCall(ISIdentity(((Mat_SeqAIJ *)B->data)->row, &naturalOrdering));
    B->ops->solve          = naturalOrdering ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = naturalOrdering ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1046 
1047 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1048 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1049 {
1050   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1054   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1055   cusparseIndexBase_t                indexBase;
1056   cusparseMatrixType_t               matrixType;
1057   cusparseFillMode_t                 fillMode;
1058   cusparseDiagType_t                 diagType;
1059 
1060   PetscFunctionBegin;
1061   /* allocate space for the transpose of the lower triangular factor */
1062   PetscCall(PetscNew(&loTriFactorT));
1063   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1064 
1065   /* set the matrix descriptors of the lower triangular factor */
1066   matrixType = cusparseGetMatType(loTriFactor->descr);
1067   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1068   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1069   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1070 
1071   /* Create the matrix description */
1072   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1073   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1074   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1075   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1076   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1077 
1078   /* set the operation */
1079   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1080 
1081   /* allocate GPU space for the CSC of the lower triangular factor*/
1082   loTriFactorT->csrMat                 = new CsrMatrix;
1083   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1084   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1085   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1086   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1087   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1088   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1089 
1090   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1091   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1092   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1093                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1094                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1095   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1096   #endif
1097 
1098   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1099   {
1100     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1101     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1102                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1103   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1104                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1105   #else
1106                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1107   #endif
1108     PetscCallCUSPARSE(stat);
1109   }
1110 
1111   PetscCallCUDA(WaitForCUDA());
1112   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1113 
1114   /* Create the solve analysis information */
1115   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1116   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1117   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1118   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1119                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1120   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1121   #endif
1122 
1123   /* perform the solve analysis */
1124   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1125                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1126 
1127   PetscCallCUDA(WaitForCUDA());
1128   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1129 
1130   /* assign the pointer */
1131   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1132 
1133   /*********************************************/
1134   /* Now the Transpose of the Upper Tri Factor */
1135   /*********************************************/
1136 
1137   /* allocate space for the transpose of the upper triangular factor */
1138   PetscCall(PetscNew(&upTriFactorT));
1139   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1140 
1141   /* set the matrix descriptors of the upper triangular factor */
1142   matrixType = cusparseGetMatType(upTriFactor->descr);
1143   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1144   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1145   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1146 
1147   /* Create the matrix description */
1148   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1149   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1150   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1151   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1152   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1153 
1154   /* set the operation */
1155   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1156 
1157   /* allocate GPU space for the CSC of the upper triangular factor*/
1158   upTriFactorT->csrMat                 = new CsrMatrix;
1159   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1160   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1161   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1162   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1163   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1164   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1165 
1166   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1167   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1168   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1169                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1170                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1171   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1172   #endif
1173 
1174   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1175   {
1176     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1177     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1178                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1179   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1180                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1181   #else
1182                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1183   #endif
1184     PetscCallCUSPARSE(stat);
1185   }
1186 
1187   PetscCallCUDA(WaitForCUDA());
1188   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1189 
1190   /* Create the solve analysis information */
1191   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1192   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1193   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1194   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1195                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1196   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1197   #endif
1198 
1199   /* perform the solve analysis */
1200   /* christ, would it have killed you to put this stuff in a function????????? */
1201   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1202                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1203 
1204   PetscCallCUDA(WaitForCUDA());
1205   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1206 
1207   /* assign the pointer */
1208   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 #endif
1212 
/* Unary functor, callable on host and device, that converts a PetscScalar to a PetscInt by casting its real part */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar scalar)
  {
    const auto realPart = PetscRealPart(scalar);
    return static_cast<PetscInt>(realPart);
  }
};
1216 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates or refreshes the explicit transpose of A that is kept in
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose

  Input Parameter:
. A - the matrix

  Notes:
  MatSeqAIJCUSPARSECopyToGPU() is called first. If A->transupdated is already set, the routine returns without
  doing anything else.

  MAT_CUSPARSE_CSR: the transpose struct is created when absent. The csr2csc permutation is computed once, by
  transposing a matrix whose values are 0, 1, 2, ..., and is cached in cusparsestruct->csr2csc_i. Every call then
  gathers the current values of A through that permutation into the transpose.

  MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB: MatSeqAIJCUSPARSEInvalidateTranspose() is called, and when no transpose is
  present it is rebuilt via HYB -> CSR -> CSC -> HYB. With CUDA >= 11 this path errors with PETSC_ERR_SUP.

  On successful return A->transupdated is PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* transpose is current, nothing to do */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants 1, 0 and 1 */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose has A->cmap->n rows and A->rmap->n columns */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* uncompressed row offsets of A on the GPU, taken from the host array a->i */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns, so the array that receives the CSC column
         pointers (tempT->row_offsets) must hold A->cmap->n + 1 entries; sizing it with A->rmap->n + 1 would be
         too small whenever A has more columns than rows */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the matrix being converted is the transpose, hence cmap->n rows by rmap->n columns */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Transpose a matrix with the sparsity of A but values 0, 1, 2, ...: the transposed values then tell, for
         every entry of the transpose, which entry of A it comes from */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* cache the permutation as integers; matrixT->values currently holds it as scalars */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the current values of A through the cached permutation into the transpose */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1409 
1410 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - Applies the L and U factors held in A->spptr to b with two cusparseSpSV_solve()
  calls (L first, then U) and stores the result in x

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  When row/column permutation indices are present, b is gathered through the row permutation before the solves
  and the result is gathered through the column permutation afterwards; the work array X of the factor struct is
  used as the intermediate in both cases.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *seqaij  = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     noTrans = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));

  /* Input of the L solve: b itself, or b gathered through the row permutation into factors->X */
  if (!factors->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdev));
  } else {
    thrust::device_ptr<const PetscScalar> bptr = thrust::device_pointer_cast(bdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  }

  /* L Y = X. Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, noTrans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_L));

  /* U X = Y: write into the intermediate factors->X when a column permutation follows, otherwise straight into x */
  {
    void *uSolveOut = factors->cpermIndices ? (void *)factors->X : (void *)xdev;

    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, uSolveOut));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, noTrans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  /* Gather factors->X through the column permutation into x, if there is one */
  if (factors->cpermIndices) {
    thrust::device_ptr<PetscScalar> xptr = thrust::device_pointer_cast(xdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), xptr));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * seqaij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1462 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Transpose solve with the L and U factors held in A->spptr: runs
  cusparseSpSV_solve() with CUSPARSE_OPERATION_TRANSPOSE on U first and on L second

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The SpSV descriptors and buffers for the transposed solves are created on the first call, and the SpSV analysis
  is (re)run whenever updatedTransposeSpSVAnalysis is not set. The factor matrices themselves are not transposed;
  only the operation passed to cuSPARSE differs.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *seqaij  = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     trans   = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  /* One-time setup on the first MatSolveTranspose(): descriptors and buffers for the transposed solves */
  if (!factors->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Lt));
    /* The matrix is still L. We only do transpose solve with it */
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Lt, &factors->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut, &factors->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Lt, factors->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Ut, factors->spsvBufferSize_Ut));
    factors->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Run the analysis for both transposed solves when it is not up to date */
  if (!factors->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Lt, factors->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut, factors->spsvBuffer_Ut));
    factors->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));

  /* Input of the first solve: b itself, or b gathered through the row permutation into factors->X */
  if (!factors->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdev));
  } else {
    thrust::device_ptr<const PetscScalar> bptr = thrust::device_pointer_cast(bdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  }

  /* Ut Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut));

  /* Lt X = Y: if a permutation follows, the intermediate buffer factors->X receives the result, otherwise x does */
  {
    void *lSolveOut = factors->cpermIndices ? (void *)factors->X : (void *)xdev;

    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, lSolveOut));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_Lt));

  /* Gather factors->X through the column permutation into x, if there is one */
  if (factors->cpermIndices) {
    thrust::device_ptr<PetscScalar> xptr = thrust::device_pointer_cast(xdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), xptr));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * seqaij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1533 #else
1534 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve using the explicitly transposed triangular factors and the
  csrsv solve interface

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are built on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().

  rpermIndices and cpermIndices are dereferenced unconditionally, so both are presumably always present when
  this variant is installed (a separate natural-ordering variant exists) -- confirm against the caller.

  All thrust calls are wrapped in PetscCallThrust() so that a thrust exception is reported as a PETSc error
  instead of propagating out of the function.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather bb into xx */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve with the transposed U factor: xarray is passed as input, the work vector as output */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with the transposed L factor: the work vector is passed as input, xarray as output */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1586 
/*
  Transpose solve using the triangular factors attached to A->spptr, without any permutation:
  bb is handed directly to the first triangular solve and the second one writes directly into xx.
  If neither transposed factor exists on entry, they are created through
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() before solving.

  Input:  A  - the factored matrix
          bb - right-hand side (read only)
  Output: xx - solution (overwritten)
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Lazily build the transposed factors, then pick up the freshly stored pointers */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays: xx is obtained for writing, bb for reading */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: solve with the transposed upper factor, bb -> work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, upperT->csrMat->num_rows, upperT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr,
                                         upperT->csrMat->values->data().get(), upperT->csrMat->row_offsets->data().get(), upperT->csrMat->column_indices->data().get(),
                                         upperT->solveInfo, rhs, work->data().get(), upperT->solvePolicy, upperT->solveBuffer));

  /* Step 2: solve with the transposed lower factor, work vector -> xx */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, lowerT->csrMat->num_rows, lowerT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr,
                                         lowerT->csrMat->values->data().get(), lowerT->csrMat->row_offsets->data().get(), lowerT->csrMat->column_indices->data().get(),
                                         lowerT->solveInfo, work->data().get(), sol, lowerT->solvePolicy, lowerT->solveBuffer));

  /* Hand the arrays back, close the GPU timer and log the flop count */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1624 
/*
  Solve with the triangular factors attached to A->spptr, applying the stored row permutation
  to bb before the two triangular solves and the stored column permutation afterwards.

  Data flow:  bb --(gather through rpermIndices)--> work --(L solve)--> xx --(U solve)--> work
              --(gather through cpermIndices)--> xx

  Input:  A  - the factored matrix
          bb - right-hand side (read only)
  Output: xx - solution (overwritten)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Raw device arrays, plus thrust views of them for the permutation copies */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));
  thrust::device_ptr<PetscScalar>       solDev = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsDev = thrust::device_pointer_cast(rhs);

  PetscCall(PetscLogGpuTimeBegin());
  /* Gather bb through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->end()),
               work->begin());

  /* Lower triangular solve: work vector -> xx */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, lower->csrMat->num_rows, lower->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                         lower->csrMat->values->data().get(), lower->csrMat->row_offsets->data().get(), lower->csrMat->column_indices->data().get(),
                                         lower->solveInfo, work->data().get(), sol, lower->solvePolicy, lower->solveBuffer));

  /* Upper triangular solve: xx -> work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, upper->csrMat->num_rows, upper->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                         upper->csrMat->values->data().get(), upper->csrMat->row_offsets->data().get(), upper->csrMat->column_indices->data().get(),
                                         upper->solveInfo, sol, work->data().get(), upper->solvePolicy, upper->solveBuffer));

  /* Gather the work vector through the column permutation into xx */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solDev);

  /* Release the arrays, stop the GPU timer and log the flop count */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1664 
/*
  Solve with the triangular factors attached to A->spptr without any permutation:
  the lower solve reads bb directly and the upper solve writes xx directly, with the
  factors' work vector holding the intermediate result.

  Input:  A  - the factored matrix
          bb - right-hand side (read only)
  Output: xx - solution (overwritten)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Device arrays: xx is obtained for writing, bb for reading */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  /* Lower triangular solve: bb -> work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, lower->csrMat->num_rows, lower->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                         lower->csrMat->values->data().get(), lower->csrMat->row_offsets->data().get(), lower->csrMat->column_indices->data().get(),
                                         lower->solveInfo, rhs, work->data().get(), lower->solvePolicy, lower->solveBuffer));

  /* Upper triangular solve: work vector -> xx */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, upper->csrMat->num_rows, upper->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                         upper->csrMat->values->data().get(), upper->csrMat->row_offsets->data().get(), upper->csrMat->column_indices->data().get(),
                                         upper->solveInfo, work->data().get(), sol, upper->solvePolicy, upper->solveBuffer));

  /* Release the arrays, stop the GPU timer and log the flop count */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1694 #endif
1695 
1696 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Numeric phase of the ILU(0) factorization: copies A's device values into fact's value array,
  factorizes that array in place with csrilu02, runs the SpSV analysis for L and U on the new
  values, and installs the solve callbacks on fact.

  Input:  fact - factor matrix prepared by the symbolic phase (its spptr holds the buffers/descriptors used here)
          A    - matrix to factor; in debug builds it is checked to be MATSEQAIJCUSPARSE
  The MatFactorInfo argument is unused.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp   = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Sizes come from fact; the values come from A's up-to-date device copy */
  const PetscInt m  = fact->rmap->n;
  const PetscInt nz = factAij->nz;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  CsrMatrix *Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* In-place factorization; skipped for m == 0 because cusparseXcsrilu02 errors out with empty matrices */
  if (m) {
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int                    numerical_zero;
    const cusparseStatus_t status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);

    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it needs valid matrix values, so it has to come after cusparseXcsrilu02().
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* The values of L and U changed, so the transpose-solve analysis must be redone before its next use */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Publish the result: the factor now lives on the GPU and only vector solves are provided.
     spMatDescr_L/U use 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64; the info is encoded in cusparseSpMatDescr_t.
  */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1750 
/*
  Symbolic phase of the ILU(0) factorization on the GPU.

  Resets any previous factor data in fact->spptr, duplicates A's metadata into fact, copies A's
  32-bit CSR row offsets and column indices to device arrays owned by fact, creates the cuSPARSE
  descriptors for M (the matrix factored in place), L and U, sizes and allocates the work buffers,
  runs the csrilu02 analysis, and estimates the flops of the numeric phase.

  Input:  fact - the factor matrix to set up
          A    - the matrix to be factored; in debug builds it is checked to be a square
                 MATSEQAIJCUSPARSE matrix with no missing diagonal entry
          info - only info->fill is read (recorded as fill_ratio_given)
  The two IS arguments (orderings) are unused.

  On success fact->ops->lufactornumeric is set to MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are two views of the same CSR arrays, distinguished only by fill mode and diagonal type */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i]; /* exact number of entries left of the diagonal; not overwritten by a symmetric-pattern guess */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Eliminating the k-th one (k = 0..nzLeft-1) touches nzRow - k entries at 2 flops each, which sums to
          nzLeft * (2 * nzRow - nzLeft + 1).
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1907 
/*
  Solve with an ICC(0) factor: a non-transposed SpSV solve with L followed by a transposed
  SpSV solve with the same L descriptor. The intermediate vector lives in fs->Y.

  Input:  fact - the factored matrix (its spptr holds the descriptors used here)
          b    - right-hand side (read only)
  Output: x    - solution (overwritten)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *sol;
  const PetscScalar            *rhs;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L*y = b: descriptor X points at b's array, descriptor Y at the scratch vector fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, /* L Y = X */
                                       cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve Lt*x = y: descriptor X is re-pointed at x's array and the roles of X and Y are swapped */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, /* Lt X = Y */
                                       cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factAij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1938 
/*
  Numeric phase of the ICC(0) factorization: copies A's device values into fact's value array,
  factorizes that array in place with csric02, runs the SpSV analysis for L (non-transposed and
  transposed) on the new values, and installs the solve callbacks on fact.

  Input:  fact - factor matrix prepared by the symbolic phase (its spptr holds the buffers/descriptors used here)
          A    - matrix to factor; in debug builds it is checked to be MATSEQAIJCUSPARSE
  The MatFactorInfo argument is unused.

  The GPU work is bracketed with PetscLogGpuTimeBegin()/PetscLogGpuTimeEnd(), in the same way as
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(), so that the flops logged at the end have a matching GPU time.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  /* The call is skipped for m == 0, mirroring the guard used for the buffer-size query in the symbolic phase */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis for L, done after the factorization so that it sees the factored values */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* The same routine is installed for solve and solvetranspose; no multi-RHS solves are provided */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1991 
1992 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1993 {
1994   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1995   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1996   PetscInt                      m, nz;
1997 
1998   PetscFunctionBegin;
1999   if (PetscDefined(USE_DEBUG)) {
2000     PetscInt  i;
2001     PetscBool flg, missing;
2002 
2003     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2004     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2005     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2006     PetscCall(MatMissingDiagonal(A, &missing, &i));
2007     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2008   }
2009 
2010   /* Free the old stale stuff */
2011   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2012 
2013   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2014      but they will not be used. Allocate them just for easy debugging.
2015    */
2016   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2017 
2018   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2019   fact->factortype             = MAT_FACTOR_ICC;
2020   fact->info.factor_mallocs    = 0;
2021   fact->info.fill_ratio_given  = info->fill;
2022   fact->info.fill_ratio_needed = 1.0;
2023 
2024   aij->row = NULL;
2025   aij->col = NULL;
2026 
2027   /* ====================================================================== */
2028   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2029   /* We'll do in-place factorization on fact                                */
2030   /* ====================================================================== */
2031   const int *Ai, *Aj;
2032 
2033   m  = fact->rmap->n;
2034   nz = aij->nz;
2035 
2036   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2037   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2038   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2039   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2040   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2041   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2042 
2043   /* ====================================================================== */
2044   /* Create mat descriptors for M, L                                        */
2045   /* ====================================================================== */
2046   cusparseFillMode_t fillMode;
2047   cusparseDiagType_t diagType;
2048 
2049   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2050   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2051   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2052 
2053   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2054     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2055     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2056     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2057     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2058   */
2059   fillMode = CUSPARSE_FILL_MODE_LOWER;
2060   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2061   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2062   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2063   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2064 
2065   /* ========================================================================= */
2066   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2067   /* ========================================================================= */
2068   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2069   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2070 
2071   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2072   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2073 
2074   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2075   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2079 
2080   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2081   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2082 
2083   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2084      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2085    */
2086   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2087     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2088     fs->spsvBuffer_L = fs->factBuffer_M;
2089     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2090   } else {
2091     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2092     fs->spsvBuffer_Lt = fs->factBuffer_M;
2093     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2094   }
2095 
2096   /* ========================================================================== */
2097   /* Perform analysis of ic0 on M                                               */
2098   /* The lower triangular part of M has the same sparsity pattern as L          */
2099   /* ========================================================================== */
2100   int              structural_zero;
2101   cusparseStatus_t status;
2102 
2103   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2104   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2105   if (PetscDefined(USE_DEBUG)) {
2106     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2107     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2108     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2109   }
2110 
2111   /* Estimate FLOPs of the numeric factorization */
2112   {
2113     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2114     PetscInt      *Ai, nzRow, nzLeft;
2115     PetscLogDouble flops = 0.0;
2116 
2117     Ai = Aseq->i;
2118     for (PetscInt i = 0; i < m; i++) {
2119       nzRow = Ai[i + 1] - Ai[i];
2120       if (nzRow > 1) {
2121         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2122           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2123         */
2124         nzLeft = (nzRow - 1) / 2;
2125         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2126       }
2127     }
2128     fs->numericFactFlops = flops;
2129   }
2130   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2131   PetscFunctionReturn(PETSC_SUCCESS);
2132 }
2133 #endif
2134 
2135 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2136 {
2137   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2138   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2139 
2140   PetscFunctionBegin;
2141   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2142   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2143   B->offloadmask = PETSC_OFFLOAD_CPU;
2144 
2145   if (!cusparsestruct->use_cpu_solve) {
2146 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2147     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2148     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2149 #else
2150     /* determine which version of MatSolve needs to be used. */
2151     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2152     IS          isrow = b->row, iscol = b->col;
2153     PetscBool   row_identity, col_identity;
2154 
2155     PetscCall(ISIdentity(isrow, &row_identity));
2156     PetscCall(ISIdentity(iscol, &col_identity));
2157     if (row_identity && col_identity) {
2158       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2159       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2160     } else {
2161       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2162       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2163     }
2164 #endif
2165   }
2166   B->ops->matsolve          = NULL;
2167   B->ops->matsolvetranspose = NULL;
2168 
2169   /* get the triangular factors */
2170   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2171   PetscFunctionReturn(PETSC_SUCCESS);
2172 }
2173 
2174 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2175 {
2176   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2177 
2178   PetscFunctionBegin;
2179   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2180   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2181   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2182   PetscFunctionReturn(PETSC_SUCCESS);
2183 }
2184 
2185 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2186 {
2187   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2188 
2189   PetscFunctionBegin;
2190 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2191   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2192   if (cusparseTriFactors->factorizeOnDevice) {
2193     PetscCall(ISIdentity(isrow, &row_identity));
2194     PetscCall(ISIdentity(iscol, &col_identity));
2195   }
2196   if (!info->levels && row_identity && col_identity) {
2197     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2198   } else
2199 #endif
2200   {
2201     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2202     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2203     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2204   }
2205   PetscFunctionReturn(PETSC_SUCCESS);
2206 }
2207 
2208 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2209 {
2210   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2211 
2212   PetscFunctionBegin;
2213 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2214   PetscBool perm_identity = PETSC_FALSE;
2215   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
2216   if (!info->levels && perm_identity) {
2217     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2218   } else
2219 #endif
2220   {
2221     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2222     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2223     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2224   }
2225   PetscFunctionReturn(PETSC_SUCCESS);
2226 }
2227 
2228 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2229 {
2230   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2231 
2232   PetscFunctionBegin;
2233   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2234   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2235   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2236   PetscFunctionReturn(PETSC_SUCCESS);
2237 }
2238 
2239 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2240 {
2241   PetscFunctionBegin;
2242   *type = MATSOLVERCUSPARSE;
2243   PetscFunctionReturn(PETSC_SUCCESS);
2244 }
2245 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
2259 
2260 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2261 {
2262   PetscInt  n = A->rmap->n;
2263   PetscBool factOnDevice, factOnHost;
2264   char     *prefix;
2265   char      factPlace[32] = "device"; /* the default */
2266 
2267   PetscFunctionBegin;
2268   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2269   PetscCall(MatSetSizes(*B, n, n, n, n));
2270   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2271   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2272 
2273   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
2274   PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
2275   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
2276   PetscOptionsEnd();
2277   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
2278   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
2279   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
2280   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
2281 
2282   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2283   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2284     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2285     if (!A->boundtocpu) {
2286       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2287       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2288     } else {
2289       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2290       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2291     }
2292     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2293     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2294     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2295   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2296     if (!A->boundtocpu) {
2297       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2298       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2299     } else {
2300       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2301       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2302     }
2303     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2304     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2305   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2306 
2307   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2308   (*B)->canuseordering = PETSC_TRUE;
2309   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2310   PetscFunctionReturn(PETSC_SUCCESS);
2311 }
2312 
2313 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2314 {
2315   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2316   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2317 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2318   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2319 #endif
2320 
2321   PetscFunctionBegin;
2322   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2323     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2324     if (A->factortype == MAT_FACTOR_NONE) {
2325       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2326       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2327     }
2328 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2329     else if (fs->csrVal) {
2330       /* We have a factorized matrix on device and are able to copy it to host */
2331       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2332     }
2333 #endif
2334     else
2335       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2336     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2337     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2338     A->offloadmask = PETSC_OFFLOAD_BOTH;
2339   }
2340   PetscFunctionReturn(PETSC_SUCCESS);
2341 }
2342 
2343 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2344 {
2345   PetscFunctionBegin;
2346   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2347   *array = ((Mat_SeqAIJ *)A->data)->a;
2348   PetscFunctionReturn(PETSC_SUCCESS);
2349 }
2350 
2351 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2352 {
2353   PetscFunctionBegin;
2354   A->offloadmask = PETSC_OFFLOAD_CPU;
2355   *array         = NULL;
2356   PetscFunctionReturn(PETSC_SUCCESS);
2357 }
2358 
2359 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2360 {
2361   PetscFunctionBegin;
2362   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2363   *array = ((Mat_SeqAIJ *)A->data)->a;
2364   PetscFunctionReturn(PETSC_SUCCESS);
2365 }
2366 
2367 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2368 {
2369   PetscFunctionBegin;
2370   *array = NULL;
2371   PetscFunctionReturn(PETSC_SUCCESS);
2372 }
2373 
2374 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2375 {
2376   PetscFunctionBegin;
2377   *array = ((Mat_SeqAIJ *)A->data)->a;
2378   PetscFunctionReturn(PETSC_SUCCESS);
2379 }
2380 
2381 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2382 {
2383   PetscFunctionBegin;
2384   A->offloadmask = PETSC_OFFLOAD_CPU;
2385   *array         = NULL;
2386   PetscFunctionReturn(PETSC_SUCCESS);
2387 }
2388 
2389 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2390 {
2391   Mat_SeqAIJCUSPARSE *cusp;
2392   CsrMatrix          *matrix;
2393 
2394   PetscFunctionBegin;
2395   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2396   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2397   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2398   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2399   matrix = (CsrMatrix *)cusp->mat->mat;
2400 
2401   if (i) {
2402 #if !defined(PETSC_USE_64BIT_INDICES)
2403     *i = matrix->row_offsets->data().get();
2404 #else
2405     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2406 #endif
2407   }
2408   if (j) {
2409 #if !defined(PETSC_USE_64BIT_INDICES)
2410     *j = matrix->column_indices->data().get();
2411 #else
2412     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2413 #endif
2414   }
2415   if (a) *a = matrix->values->data().get();
2416   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2417   PetscFunctionReturn(PETSC_SUCCESS);
2418 }
2419 
2420 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2421 {
2422   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2423   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2424   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2425   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2426   cusparseStatus_t              stat;
2427   PetscBool                     both = PETSC_TRUE;
2428 
2429   PetscFunctionBegin;
2430   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2431   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2432     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2433       CsrMatrix *matrix;
2434       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2435 
2436       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2437       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2438       matrix->values->assign(a->a, a->a + a->nz);
2439       PetscCallCUDA(WaitForCUDA());
2440       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2441       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2442       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2443     } else {
2444       PetscInt nnz;
2445       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2446       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2447       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2448       delete cusparsestruct->workVector;
2449       delete cusparsestruct->rowoffsets_gpu;
2450       cusparsestruct->workVector     = NULL;
2451       cusparsestruct->rowoffsets_gpu = NULL;
2452       try {
2453         if (a->compressedrow.use) {
2454           m    = a->compressedrow.nrows;
2455           ii   = a->compressedrow.i;
2456           ridx = a->compressedrow.rindex;
2457         } else {
2458           m    = A->rmap->n;
2459           ii   = a->i;
2460           ridx = NULL;
2461         }
2462         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2463         if (!a->a) {
2464           nnz  = ii[m];
2465           both = PETSC_FALSE;
2466         } else nnz = a->nz;
2467         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2468 
2469         /* create cusparse matrix */
2470         cusparsestruct->nrows = m;
2471         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2472         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2473         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2474         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2475 
2476         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2477         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2478         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2479         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2480         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2481         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2482         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2483 
2484         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2485         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2486           /* set the matrix */
2487           CsrMatrix *mat   = new CsrMatrix;
2488           mat->num_rows    = m;
2489           mat->num_cols    = A->cmap->n;
2490           mat->num_entries = nnz;
2491           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2492           mat->row_offsets->assign(ii, ii + m + 1);
2493 
2494           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2495           mat->column_indices->assign(a->j, a->j + nnz);
2496 
2497           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2498           if (a->a) mat->values->assign(a->a, a->a + nnz);
2499 
2500           /* assign the pointer */
2501           matstruct->mat = mat;
2502 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2503           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2504             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2505                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2506             PetscCallCUSPARSE(stat);
2507           }
2508 #endif
2509         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2510 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2511           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2512 #else
2513           CsrMatrix *mat   = new CsrMatrix;
2514           mat->num_rows    = m;
2515           mat->num_cols    = A->cmap->n;
2516           mat->num_entries = nnz;
2517           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2518           mat->row_offsets->assign(ii, ii + m + 1);
2519 
2520           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2521           mat->column_indices->assign(a->j, a->j + nnz);
2522 
2523           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2524           if (a->a) mat->values->assign(a->a, a->a + nnz);
2525 
2526           cusparseHybMat_t hybMat;
2527           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2528           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2529           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2530           PetscCallCUSPARSE(stat);
2531           /* assign the pointer */
2532           matstruct->mat = hybMat;
2533 
2534           if (mat) {
2535             if (mat->values) delete (THRUSTARRAY *)mat->values;
2536             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2537             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2538             delete (CsrMatrix *)mat;
2539           }
2540 #endif
2541         }
2542 
2543         /* assign the compressed row indices */
2544         if (a->compressedrow.use) {
2545           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2546           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2547           matstruct->cprowIndices->assign(ridx, ridx + m);
2548           tmp = m;
2549         } else {
2550           cusparsestruct->workVector = NULL;
2551           matstruct->cprowIndices    = NULL;
2552           tmp                        = 0;
2553         }
2554         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2555 
2556         /* assign the pointer */
2557         cusparsestruct->mat = matstruct;
2558       } catch (char *ex) {
2559         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2560       }
2561       PetscCallCUDA(WaitForCUDA());
2562       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2563       cusparsestruct->nonzerostate = A->nonzerostate;
2564     }
2565     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2566   }
2567   PetscFunctionReturn(PETSC_SUCCESS);
2568 }
2569 
2570 struct VecCUDAPlusEquals {
2571   template <typename Tuple>
2572   __host__ __device__ void operator()(Tuple t)
2573   {
2574     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2575   }
2576 };
2577 
2578 struct VecCUDAEquals {
2579   template <typename Tuple>
2580   __host__ __device__ void operator()(Tuple t)
2581   {
2582     thrust::get<1>(t) = thrust::get<0>(t);
2583   }
2584 };
2585 
2586 struct VecCUDAEqualsReverse {
2587   template <typename Tuple>
2588   __host__ __device__ void operator()(Tuple t)
2589   {
2590     thrust::get<0>(t) = thrust::get<1>(t);
2591   }
2592 };
2593 
2594 struct MatMatCusparse {
2595   PetscBool      cisdense;
2596   PetscScalar   *Bt;
2597   Mat            X;
2598   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2599   PetscLogDouble flops;
2600   CsrMatrix     *Bcsr;
2601 
2602 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2603   cusparseSpMatDescr_t matSpBDescr;
2604   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2605   cusparseDnMatDescr_t matBDescr;
2606   cusparseDnMatDescr_t matCDescr;
2607   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2608   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2609   void *dBuffer4;
2610   void *dBuffer5;
2611   #endif
2612   size_t                mmBufferSize;
2613   void                 *mmBuffer;
2614   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2615   cusparseSpGEMMDescr_t spgemmDesc;
2616 #endif
2617 };
2618 
2619 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2620 {
2621   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2622 
2623   PetscFunctionBegin;
2624   PetscCallCUDA(cudaFree(mmdata->Bt));
2625   delete mmdata->Bcsr;
2626 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2627   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2628   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2629   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2630   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2631   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2632   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2633   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2634   #endif
2635   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2636   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2637 #endif
2638   PetscCall(MatDestroy(&mmdata->X));
2639   PetscCall(PetscFree(data));
2640   PetscFunctionReturn(PETSC_SUCCESS);
2641 }
2642 
2643 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2644 
2645 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2646 {
2647   Mat_Product                  *product = C->product;
2648   Mat                           A, B;
2649   PetscInt                      m, n, blda, clda;
2650   PetscBool                     flg, biscuda;
2651   Mat_SeqAIJCUSPARSE           *cusp;
2652   cusparseStatus_t              stat;
2653   cusparseOperation_t           opA;
2654   const PetscScalar            *barray;
2655   PetscScalar                  *carray;
2656   MatMatCusparse               *mmdata;
2657   Mat_SeqAIJCUSPARSEMultStruct *mat;
2658   CsrMatrix                    *csrmat;
2659 
2660   PetscFunctionBegin;
2661   MatCheckProduct(C, 1);
2662   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2663   mmdata = (MatMatCusparse *)product->data;
2664   A      = product->A;
2665   B      = product->B;
2666   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2667   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2668   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2669      Instead of silently accepting the wrong answer, I prefer to raise the error */
2670   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2671   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2672   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2673   switch (product->type) {
2674   case MATPRODUCT_AB:
2675   case MATPRODUCT_PtAP:
2676     mat = cusp->mat;
2677     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2678     m   = A->rmap->n;
2679     n   = B->cmap->n;
2680     break;
2681   case MATPRODUCT_AtB:
2682     if (!A->form_explicit_transpose) {
2683       mat = cusp->mat;
2684       opA = CUSPARSE_OPERATION_TRANSPOSE;
2685     } else {
2686       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2687       mat = cusp->matTranspose;
2688       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2689     }
2690     m = A->cmap->n;
2691     n = B->cmap->n;
2692     break;
2693   case MATPRODUCT_ABt:
2694   case MATPRODUCT_RARt:
2695     mat = cusp->mat;
2696     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2697     m   = A->rmap->n;
2698     n   = B->rmap->n;
2699     break;
2700   default:
2701     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2702   }
2703   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2704   csrmat = (CsrMatrix *)mat->mat;
2705   /* if the user passed a CPU matrix, copy the data to the GPU */
2706   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2707   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2708   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2709 
2710   PetscCall(MatDenseGetLDA(B, &blda));
2711   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2714   } else {
2715     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2716     PetscCall(MatDenseGetLDA(C, &clda));
2717   }
2718 
2719   PetscCall(PetscLogGpuTimeBegin());
2720 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2721   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2722   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2723   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2724   #else
2725   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2726   #endif
2727 
2728   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2729   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2730     size_t mmBufferSize;
2731     if (mmdata->initialized && mmdata->Blda != blda) {
2732       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2733       mmdata->matBDescr = NULL;
2734     }
2735     if (!mmdata->matBDescr) {
2736       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737       mmdata->Blda = blda;
2738     }
2739 
2740     if (mmdata->initialized && mmdata->Clda != clda) {
2741       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2742       mmdata->matCDescr = NULL;
2743     }
2744     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2745       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2746       mmdata->Clda = clda;
2747     }
2748 
2749   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2750     if (matADescr) {
2751       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2752       matADescr = NULL;
2753     }
2754   #endif
2755 
2756     if (!matADescr) {
2757       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2758                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2759       PetscCallCUSPARSE(stat);
2760     }
2761 
2762     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2763 
2764     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2765       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2766       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2767       mmdata->mmBufferSize = mmBufferSize;
2768     }
2769 
2770   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
2771     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2772   #endif
2773 
2774     mmdata->initialized = PETSC_TRUE;
2775   } else {
2776     /* to be safe, always update pointers of the mats */
2777     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2778     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2779     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2780   }
2781 
2782   /* do cusparseSpMM, which supports transpose on B */
2783   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2784 #else
2785   PetscInt k;
2786   /* cusparseXcsrmm does not support transpose on B */
2787   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2788     cublasHandle_t cublasv2handle;
2789     cublasStatus_t cerr;
2790 
2791     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2792     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2793     PetscCallCUBLAS(cerr);
2794     blda = B->cmap->n;
2795     k    = B->cmap->n;
2796   } else {
2797     k = B->rmap->n;
2798   }
2799 
2800   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2801   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2802   PetscCallCUSPARSE(stat);
2803 #endif
2804   PetscCall(PetscLogGpuTimeEnd());
2805   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2806   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2807   if (product->type == MATPRODUCT_RARt) {
2808     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2809     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2810   } else if (product->type == MATPRODUCT_PtAP) {
2811     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2812     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2813   } else {
2814     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2815   }
2816   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2817   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2818   PetscFunctionReturn(PETSC_SUCCESS);
2819 }
2820 
2821 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2822 {
2823   Mat_Product        *product = C->product;
2824   Mat                 A, B;
2825   PetscInt            m, n;
2826   PetscBool           cisdense, flg;
2827   MatMatCusparse     *mmdata;
2828   Mat_SeqAIJCUSPARSE *cusp;
2829 
2830   PetscFunctionBegin;
2831   MatCheckProduct(C, 1);
2832   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2833   A = product->A;
2834   B = product->B;
2835   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2836   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2837   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2838   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2839   switch (product->type) {
2840   case MATPRODUCT_AB:
2841     m = A->rmap->n;
2842     n = B->cmap->n;
2843     break;
2844   case MATPRODUCT_AtB:
2845     m = A->cmap->n;
2846     n = B->cmap->n;
2847     break;
2848   case MATPRODUCT_ABt:
2849     m = A->rmap->n;
2850     n = B->rmap->n;
2851     break;
2852   case MATPRODUCT_PtAP:
2853     m = B->cmap->n;
2854     n = B->cmap->n;
2855     break;
2856   case MATPRODUCT_RARt:
2857     m = B->rmap->n;
2858     n = B->rmap->n;
2859     break;
2860   default:
2861     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2862   }
2863   PetscCall(MatSetSizes(C, m, n, m, n));
2864   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2865   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2866   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2867 
2868   /* product data */
2869   PetscCall(PetscNew(&mmdata));
2870   mmdata->cisdense = cisdense;
2871 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2872   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2873   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2874 #endif
2875   /* for these products we need intermediate storage */
2876   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2877     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2878     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2879     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2880       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2881     } else {
2882       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2883     }
2884   }
2885   C->product->data    = mmdata;
2886   C->product->destroy = MatDestroy_MatMatCusparse;
2887 
2888   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2889   PetscFunctionReturn(PETSC_SUCCESS);
2890 }
2891 
2892 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2893 {
2894   Mat_Product                  *product = C->product;
2895   Mat                           A, B;
2896   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2897   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2898   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2899   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2900   PetscBool                     flg;
2901   cusparseStatus_t              stat;
2902   MatProductType                ptype;
2903   MatMatCusparse               *mmdata;
2904 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2905   cusparseSpMatDescr_t BmatSpDescr;
2906 #endif
2907   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2908 
2909   PetscFunctionBegin;
2910   MatCheckProduct(C, 1);
2911   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2912   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2913   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2914   mmdata = (MatMatCusparse *)C->product->data;
2915   A      = product->A;
2916   B      = product->B;
2917   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2918     mmdata->reusesym = PETSC_FALSE;
2919     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2920     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2921     Cmat = Ccusp->mat;
2922     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2923     Ccsr = (CsrMatrix *)Cmat->mat;
2924     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2925     goto finalize;
2926   }
2927   if (!c->nz) goto finalize;
2928   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2929   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2930   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2931   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2932   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2933   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2934   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2935   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2936   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2937   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2938   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2939   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2940   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2941   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2942 
2943   ptype = product->type;
2944   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2945     ptype = MATPRODUCT_AB;
2946     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2947   }
2948   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2949     ptype = MATPRODUCT_AB;
2950     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2951   }
2952   switch (ptype) {
2953   case MATPRODUCT_AB:
2954     Amat = Acusp->mat;
2955     Bmat = Bcusp->mat;
2956     break;
2957   case MATPRODUCT_AtB:
2958     Amat = Acusp->matTranspose;
2959     Bmat = Bcusp->mat;
2960     break;
2961   case MATPRODUCT_ABt:
2962     Amat = Acusp->mat;
2963     Bmat = Bcusp->matTranspose;
2964     break;
2965   default:
2966     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2967   }
2968   Cmat = Ccusp->mat;
2969   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2970   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2971   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2972   Acsr = (CsrMatrix *)Amat->mat;
2973   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2974   Ccsr = (CsrMatrix *)Cmat->mat;
2975   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2976   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2977   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2978   PetscCall(PetscLogGpuTimeBegin());
2979 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2980   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2981   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2982   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2983   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2984   PetscCallCUSPARSE(stat);
2985   #else
2986   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2987   PetscCallCUSPARSE(stat);
2988   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2989   PetscCallCUSPARSE(stat);
2990   #endif
2991 #else
2992   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2993                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2994   PetscCallCUSPARSE(stat);
2995 #endif
2996   PetscCall(PetscLogGpuFlops(mmdata->flops));
2997   PetscCallCUDA(WaitForCUDA());
2998   PetscCall(PetscLogGpuTimeEnd());
2999   C->offloadmask = PETSC_OFFLOAD_GPU;
3000 finalize:
3001   /* shorter version of MatAssemblyEnd_SeqAIJ */
3002   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3003   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3004   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3005   c->reallocs = 0;
3006   C->info.mallocs += 0;
3007   C->info.nz_unneeded = 0;
3008   C->assembled = C->was_assembled = PETSC_TRUE;
3009   C->num_ass++;
3010   PetscFunctionReturn(PETSC_SUCCESS);
3011 }
3012 
3013 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3014 {
3015   Mat_Product                  *product = C->product;
3016   Mat                           A, B;
3017   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3018   Mat_SeqAIJ                   *a, *b, *c;
3019   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3020   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3021   PetscInt                      i, j, m, n, k;
3022   PetscBool                     flg;
3023   cusparseStatus_t              stat;
3024   MatProductType                ptype;
3025   MatMatCusparse               *mmdata;
3026   PetscLogDouble                flops;
3027   PetscBool                     biscompressed, ciscompressed;
3028 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3029   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3030   cusparseSpMatDescr_t BmatSpDescr;
3031 #else
3032   int cnz;
3033 #endif
3034   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3035 
3036   PetscFunctionBegin;
3037   MatCheckProduct(C, 1);
3038   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3039   A = product->A;
3040   B = product->B;
3041   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3042   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3043   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3044   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3045   a = (Mat_SeqAIJ *)A->data;
3046   b = (Mat_SeqAIJ *)B->data;
3047   /* product data */
3048   PetscCall(PetscNew(&mmdata));
3049   C->product->data    = mmdata;
3050   C->product->destroy = MatDestroy_MatMatCusparse;
3051 
3052   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3053   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3054   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3055   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3056   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3057   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3058 
3059   ptype = product->type;
3060   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3061     ptype                                          = MATPRODUCT_AB;
3062     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3063   }
3064   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3065     ptype                                          = MATPRODUCT_AB;
3066     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3067   }
3068   biscompressed = PETSC_FALSE;
3069   ciscompressed = PETSC_FALSE;
3070   switch (ptype) {
3071   case MATPRODUCT_AB:
3072     m    = A->rmap->n;
3073     n    = B->cmap->n;
3074     k    = A->cmap->n;
3075     Amat = Acusp->mat;
3076     Bmat = Bcusp->mat;
3077     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3078     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3079     break;
3080   case MATPRODUCT_AtB:
3081     m = A->cmap->n;
3082     n = B->cmap->n;
3083     k = A->rmap->n;
3084     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3085     Amat = Acusp->matTranspose;
3086     Bmat = Bcusp->mat;
3087     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3088     break;
3089   case MATPRODUCT_ABt:
3090     m = A->rmap->n;
3091     n = B->rmap->n;
3092     k = A->cmap->n;
3093     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3094     Amat = Acusp->mat;
3095     Bmat = Bcusp->matTranspose;
3096     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3097     break;
3098   default:
3099     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3100   }
3101 
3102   /* create cusparse matrix */
3103   PetscCall(MatSetSizes(C, m, n, m, n));
3104   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3105   c     = (Mat_SeqAIJ *)C->data;
3106   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3107   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3108   Ccsr  = new CsrMatrix;
3109 
3110   c->compressedrow.use = ciscompressed;
3111   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3112     c->compressedrow.nrows = a->compressedrow.nrows;
3113     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3114     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3115     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3116     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3117     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3118   } else {
3119     c->compressedrow.nrows  = 0;
3120     c->compressedrow.i      = NULL;
3121     c->compressedrow.rindex = NULL;
3122     Ccusp->workVector       = NULL;
3123     Cmat->cprowIndices      = NULL;
3124   }
3125   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3126   Ccusp->mat        = Cmat;
3127   Ccusp->mat->mat   = Ccsr;
3128   Ccsr->num_rows    = Ccusp->nrows;
3129   Ccsr->num_cols    = n;
3130   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3131   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3132   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3133   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3134   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3135   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3136   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3137   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3138   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3139   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3140   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3141     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3142     c->nz                = 0;
3143     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3144     Ccsr->values         = new THRUSTARRAY(c->nz);
3145     goto finalizesym;
3146   }
3147 
3148   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3149   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3150   Acsr = (CsrMatrix *)Amat->mat;
3151   if (!biscompressed) {
3152     Bcsr = (CsrMatrix *)Bmat->mat;
3153 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3154     BmatSpDescr = Bmat->matDescr;
3155 #endif
3156   } else { /* we need to use row offsets for the full matrix */
3157     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3158     Bcsr                 = new CsrMatrix;
3159     Bcsr->num_rows       = B->rmap->n;
3160     Bcsr->num_cols       = cBcsr->num_cols;
3161     Bcsr->num_entries    = cBcsr->num_entries;
3162     Bcsr->column_indices = cBcsr->column_indices;
3163     Bcsr->values         = cBcsr->values;
3164     if (!Bcusp->rowoffsets_gpu) {
3165       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3166       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3167       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3168     }
3169     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3170     mmdata->Bcsr      = Bcsr;
3171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3172     if (Bcsr->num_rows && Bcsr->num_cols) {
3173       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3174       PetscCallCUSPARSE(stat);
3175     }
3176     BmatSpDescr = mmdata->matSpBDescr;
3177 #endif
3178   }
3179   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3180   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3181   /* precompute flops count */
3182   if (ptype == MATPRODUCT_AB) {
3183     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3184       const PetscInt st = a->i[i];
3185       const PetscInt en = a->i[i + 1];
3186       for (j = st; j < en; j++) {
3187         const PetscInt brow = a->j[j];
3188         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3189       }
3190     }
3191   } else if (ptype == MATPRODUCT_AtB) {
3192     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3193       const PetscInt anzi = a->i[i + 1] - a->i[i];
3194       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3195       flops += (2. * anzi) * bnzi;
3196     }
3197   } else { /* TODO */
3198     flops = 0.;
3199   }
3200 
3201   mmdata->flops = flops;
3202   PetscCall(PetscLogGpuTimeBegin());
3203 
3204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3205   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3206   // cuda-12.2 requires non-null csrRowOffsets
3207   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3208   PetscCallCUSPARSE(stat);
3209   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3210   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3211   {
3212     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3213      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3214   */
3215     void *dBuffer1 = NULL;
3216     void *dBuffer2 = NULL;
3217     void *dBuffer3 = NULL;
3218     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3219     size_t bufferSize1 = 0;
3220     size_t bufferSize2 = 0;
3221     size_t bufferSize3 = 0;
3222     size_t bufferSize4 = 0;
3223     size_t bufferSize5 = 0;
3224 
3225     /* ask bufferSize1 bytes for external memory */
3226     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3227     PetscCallCUSPARSE(stat);
3228     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3229     /* inspect the matrices A and B to understand the memory requirement for the next step */
3230     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3231     PetscCallCUSPARSE(stat);
3232 
3233     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3234     PetscCallCUSPARSE(stat);
3235     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3236     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3237     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3238     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3239     PetscCallCUSPARSE(stat);
3240     PetscCallCUDA(cudaFree(dBuffer1));
3241     PetscCallCUDA(cudaFree(dBuffer2));
3242 
3243     /* get matrix C non-zero entries C_nnz1 */
3244     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3245     c->nz = (PetscInt)C_nnz1;
3246     /* allocate matrix C */
3247     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3248     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3249     Ccsr->values = new THRUSTARRAY(c->nz);
3250     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3251     /* update matC with the new pointers */
3252     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3253     PetscCallCUSPARSE(stat);
3254 
3255     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3256     PetscCallCUSPARSE(stat);
3257     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3258     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3259     PetscCallCUSPARSE(stat);
3260     PetscCallCUDA(cudaFree(dBuffer3));
3261     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3262     PetscCallCUSPARSE(stat);
3263     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3264   }
3265   #else
3266   size_t bufSize2;
3267   /* ask bufferSize bytes for external memory */
3268   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3269   PetscCallCUSPARSE(stat);
3270   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3271   /* inspect the matrices A and B to understand the memory requirement for the next step */
3272   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3273   PetscCallCUSPARSE(stat);
3274   /* ask bufferSize again bytes for external memory */
3275   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3276   PetscCallCUSPARSE(stat);
3277   /* The CUSPARSE documentation is not clear, nor the API
3278      We need both buffers to perform the operations properly!
3279      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3280      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3281      is stored in the descriptor! What a messy API... */
3282   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3283   /* compute the intermediate product of A * B */
3284   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3285   PetscCallCUSPARSE(stat);
3286   /* get matrix C non-zero entries C_nnz1 */
3287   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3288   c->nz = (PetscInt)C_nnz1;
3289   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3290                       mmdata->mmBufferSize / 1024));
3291   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3292   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3293   Ccsr->values = new THRUSTARRAY(c->nz);
3294   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3295   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3296   PetscCallCUSPARSE(stat);
3297   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3298   PetscCallCUSPARSE(stat);
3299   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3300 #else
3301   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3302   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3303                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3304   PetscCallCUSPARSE(stat);
3305   c->nz                = cnz;
3306   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3307   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3308   Ccsr->values = new THRUSTARRAY(c->nz);
3309   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3310 
3311   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3312   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3313      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3314      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3315   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3316                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3317   PetscCallCUSPARSE(stat);
3318 #endif
3319   PetscCall(PetscLogGpuFlops(mmdata->flops));
3320   PetscCall(PetscLogGpuTimeEnd());
3321 finalizesym:
3322   c->free_a = PETSC_TRUE;
3323   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3324   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3325   c->free_ij = PETSC_TRUE;
3326   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3327     PetscInt      *d_i = c->i;
3328     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3329     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3330     ii = *Ccsr->row_offsets;
3331     jj = *Ccsr->column_indices;
3332     if (ciscompressed) d_i = c->compressedrow.i;
3333     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3334     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3335   } else {
3336     PetscInt *d_i = c->i;
3337     if (ciscompressed) d_i = c->compressedrow.i;
3338     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3339     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3340   }
3341   if (ciscompressed) { /* need to expand host row offsets */
3342     PetscInt r = 0;
3343     c->i[0]    = 0;
3344     for (k = 0; k < c->compressedrow.nrows; k++) {
3345       const PetscInt next = c->compressedrow.rindex[k];
3346       const PetscInt old  = c->compressedrow.i[k];
3347       for (; r < next; r++) c->i[r + 1] = old;
3348     }
3349     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3350   }
3351   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3352   PetscCall(PetscMalloc1(m, &c->ilen));
3353   PetscCall(PetscMalloc1(m, &c->imax));
3354   c->maxnz         = c->nz;
3355   c->nonzerorowcnt = 0;
3356   c->rmax          = 0;
3357   for (k = 0; k < m; k++) {
3358     const PetscInt nn = c->i[k + 1] - c->i[k];
3359     c->ilen[k] = c->imax[k] = nn;
3360     c->nonzerorowcnt += (PetscInt)!!nn;
3361     c->rmax = PetscMax(c->rmax, nn);
3362   }
3363   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3364   PetscCall(PetscMalloc1(c->nz, &c->a));
3365   Ccsr->num_entries = c->nz;
3366 
3367   C->nonzerostate++;
3368   PetscCall(PetscLayoutSetUp(C->rmap));
3369   PetscCall(PetscLayoutSetUp(C->cmap));
3370   Ccusp->nonzerostate = C->nonzerostate;
3371   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3372   C->preallocated     = PETSC_TRUE;
3373   C->assembled        = PETSC_FALSE;
3374   C->was_assembled    = PETSC_FALSE;
3375   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3376     mmdata->reusesym = PETSC_TRUE;
3377     C->offloadmask   = PETSC_OFFLOAD_GPU;
3378   }
3379   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3380   PetscFunctionReturn(PETSC_SUCCESS);
3381 }
3382 
3383 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3384 
/*
  Chooses the symbolic product routine for mat when A is SeqAIJCUSPARSE; handles sparse or dense B.

  If B (and, for ABC, C) is a SeqAIJCUSPARSE matrix not bound to the CPU, the user may still request
  the CPU backend through the options database.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *prod   = mat->product;
  PetscBool    Bdense = PETSC_FALSE, Bgpu = PETSC_FALSE, Cgpu = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)prod->B, MATSEQDENSE, &Bdense));
  if (!prod->A->boundtocpu && !prod->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)prod->B, MATSEQAIJCUSPARSE, &Bgpu));
  if (prod->type == MATPRODUCT_ABC) { /* the third factor only matters for ABC */
    Cgpu = PETSC_FALSE;
    if (!prod->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)prod->C, MATSEQAIJCUSPARSE, &Cgpu));
  }

  if (Bgpu && Cgpu) { /* we can always select the CPU backend */
    PetscBool   forcecpu = PETSC_FALSE;
    const char *apiname  = NULL; /* options title for the MatXXX() user API, also used as the manual page */
    const char *apiopt   = NULL; /* option name for the MatXXX() user API */
    const char *prodname = NULL; /* options title for the MatProduct API */

    switch (prod->type) {
    case MATPRODUCT_AB:
      apiname  = "MatMatMult";
      apiopt   = "-matmatmult_backend_cpu";
      prodname = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname  = "MatTransposeMatMult";
      apiopt   = "-mattransposematmult_backend_cpu";
      prodname = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname  = "MatPtAP";
      apiopt   = "-matptap_backend_cpu";
      prodname = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname  = "MatRARt";
      apiopt   = "-matrart_backend_cpu";
      prodname = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname  = "MatMatMatMult";
      apiopt   = "-matmatmatmult_backend_cpu";
      prodname = "MatProduct_ABC";
      break;
    default: /* no backend option is queried for the remaining product types */
      break;
    }
    if (apiname) {
      const char *title = prod->api_user ? apiname : prodname;
      const char *opt   = prod->api_user ? apiopt : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(opt, "Use CPU code", apiname, forcecpu, &forcecpu, NULL));
      PetscOptionsEnd();
    }
    if (forcecpu) Bgpu = Cgpu = PETSC_FALSE;
  }

  /* dispatch */
  if (Bdense) {
    switch (prod->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (prod->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      break;
    default:
      break;
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (!Bgpu || !Cgpu) { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  switch (prod->type) {
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
  case MATPRODUCT_ABC:
    mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    break;
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
    mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    break;
  default:
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3502 
/* yy = A xx, forwarded to the shared kernel with no vector to add and no transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3509 
/* zz = A xx + yy, forwarded to the shared kernel with no transpose */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3516 
/* yy = A^H xx, forwarded to the shared kernel with no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3523 
/* zz = A^H xx + yy, forwarded to the shared kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3530 
/* yy = A^T xx, forwarded to the shared kernel with no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3537 
/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one entry per thread

  Expects a 1D launch providing at least n threads; threads whose global index is >= n do nothing.
  The update is a plain (non-atomic) read-modify-write, so the result is only well defined when
  idx[0..n) contains no repeated entries.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Form the global index in PetscInt. The 32-bit product blockIdx.x * blockDim.x stored in an int can wrap or
     become negative for large n (e.g. with 64-bit PetscInt), which would let an invalid index pass the bound check */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3543 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - Computes zz = op(A) xx + yy with cuSPARSE

  Input Parameters:
+ A     - the matrix
. xx    - the vector op(A) is applied to
. yy    - the vector to add, or NULL to compute zz = op(A) xx
. trans - PETSC_TRUE to apply a transposed operator
- herm  - PETSC_TRUE (only valid together with trans) to apply the conjugate transpose

  Output Parameter:
. zz - the result; yy == zz is handled

  Notes:
  z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

  When the device matrix carries cprowIndices (compressed rows), cusparsestruct->workVector holds the
  short operand (transpose case) or the short result (non-transpose case), which is then scattered into zz.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* only set and used when the format is MAT_CUSPARSE_CSR */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* no stored entries: the product contributes nothing, so zz = yy or zz = 0 */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use the explicitly formed transpose with a non-transposed operation (opA stays NON_TRANSPOSE) */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA is used below as an index into cuSpMV[] (and matDescr_SpMV[] with CUDA >= 12.4), so validate it before the first such use */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.

      if (!matDescr) { /* the per-operation descriptor is created on first use */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status; catch launch errors here */
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop count: 2 per stored entry with an added vector, one less per nonzero row without */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3729 
/* zz = A^T xx + yy, forwarded to the shared kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3736 
/* Ends assembly by delegating to the SeqAIJ implementation; nothing else is done here */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); /* same arguments, passed straight through */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3743 
/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* create the object, give it its sizes (local sizes equal global sizes) and its type */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* preallocate through the SeqAIJ routine, which is handed a non-const pointer */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3791 
/* Releases the GPU-side data of A, removes the functions composed on it, then destroys the SeqAIJ part */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in the order they are cleared */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };

  PetscFunctionBegin;
  /* A->spptr holds triangular-factor data for factored matrices and the regular GPU matrix otherwise */
  if (A->factortype != MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  for (size_t f = 0; f < sizeof(composed) / sizeof(composed[0]); f++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[f], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3813 
3814 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3815 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicates A through the SeqAIJ routine, then converts the new matrix in place to MATSEQAIJCUSPARSE */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));                                            /* host-side copy */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); /* *B is both input and output */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3823 
/*
  MatAXPY_SeqAIJCUSPARSE - Y = a*X + Y for two sequential AIJ matrices

  Dispatch:
    - X and Y do not share the same axpy implementation -> host MatAXPY_SeqAIJ()
    - identical nonzero pattern                          -> cuBLAS axpy on the value arrays
    - pattern of X is a subset of the pattern of Y       -> cuSPARSE csrgeam written in place into Y
    - anything else                                      -> host MatAXPY_SeqAIJ()
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *xseq = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *yseq = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *xgpu = (Mat_SeqAIJCUSPARSE *)X->spptr;
  Mat_SeqAIJCUSPARSE *ygpu = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  const PetscScalar  *xval;
  PetscScalar        *yval;
  CsrMatrix          *xcsr, *ycsr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* from here on both matrices are bound to the GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(ygpu->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(xgpu->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix *)ygpu->mat->mat;
  xcsr = (CsrMatrix *)xgpu->mat->mat;

  /* equal nonzero counts and no compressed rows: compare the CSR patterns on the device, a match allows the cuBLAS path */
  if (str != SAME_NONZERO_PATTERN && xseq->nz == yseq->nz && !xseq->compressedrow.use && !yseq->compressedrow.use) {
    if (thrust::equal(thrust::device, ycsr->row_offsets->begin(), ycsr->row_offsets->end(), xcsr->row_offsets->begin()) && thrust::equal(thrust::device, ycsr->column_indices->begin(), ycsr->column_indices->end(), xcsr->column_indices->begin())) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SAME_NONZERO_PATTERN: {
    cublasHandle_t blas;
    PetscBLASInt   inc = 1, n = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
    PetscCall(PetscCUBLASGetHandle(&blas));
    PetscCall(PetscBLASIntCast(xseq->nz, &n));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(blas, n, &a, xval, inc, yval, inc));
    PetscCall(PetscLogGpuFlops(2.0 * n));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  case SUBSET_NONZERO_PATTERN: {
    PetscScalar unit = 1.0; /* coefficient applied to Y in a*X + unit*Y */
    const int  *xi, *xj;
    int        *yi, *yj;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t worksize;
    void  *work;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
    xi = xcsr->row_offsets->data().get();
    xj = xcsr->column_indices->data().get();
    yi = ycsr->row_offsets->data().get();
    yj = ycsr->column_indices->data().get();
    /* the scalars a and unit live on the host for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(ygpu->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(ygpu->handle, Y->rmap->n, Y->cmap->n, &a, xgpu->mat->descr, xseq->nz, xval, xi, xj, &unit, ygpu->mat->descr, yseq->nz, yval, yi, yj, ygpu->mat->descr, yval, yi, yj, &worksize));
    PetscCallCUDA(cudaMalloc(&work, worksize));
#endif
    PetscCall(PetscLogGpuTimeBegin());
    /* Y serves both as second operand and as output */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam(ygpu->handle, Y->rmap->n, Y->cmap->n, &a, xgpu->mat->descr, xseq->nz, xval, xi, xj, &unit, ygpu->mat->descr, yseq->nz, yval, yi, yj, ygpu->mat->descr, yval, yi, yj, work));
#else
    PetscCallCUSPARSE(cusparse_csr_spgeam(ygpu->handle, Y->rmap->n, Y->cmap->n, &a, xgpu->mat->descr, xseq->nz, xval, xi, xj, &unit, ygpu->mat->descr, yseq->nz, yval, yi, yj, ygpu->mat->descr, yval, yi, yj));
#endif
    PetscCall(PetscLogGpuFlops(xseq->nz + yseq->nz));
    PetscCall(PetscLogGpuTimeEnd());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUDA(cudaFree(work));
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(ygpu->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  default:
    /* general pattern: let the host implementation do the work */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3909 
/* Scale all stored values of Y by a with a single cuBLAS scal on the device value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *yseq = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t blas;
  PetscScalar   *yval;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
  PetscCall(PetscCUBLASGetHandle(&blas));
  PetscCall(PetscBLASIntCast(yseq->nz, &n)); /* number of stored entries, as a BLAS integer */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blas, n, &a, yval, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
  /* values changed, so the diagonal data kept by SeqAIJ is invalidated */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3929 
/*
  Zero the stored values of A on the host and, when they exist, in the device copies
  (the matrix itself and its explicit transpose). The offload mask ends up as BOTH when
  the device values of the matrix were zeroed as well, CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij        = (Mat_SeqAIJ *)A->data;
  PetscBool   gpu_zeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        gpu_zeroed = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* host values: a->i[nrows] is the number of stored entries */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpu_zeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3956 
/*
  MatBindToCPU_SeqAIJCUSPARSE - switch the operation tables of A between the host (SeqAIJ) and the device (SeqAIJCUSPARSE) implementations

  flg = PETSC_TRUE  : pull the data back from the GPU, install the SeqAIJ operations and remove the GPU-only composed methods
  flg = PETSC_FALSE : install the CUSPARSE operations, the array accessors and the composed methods

  For factored matrices only the boundtocpu flag is recorded.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (!flg) {
      /* device implementations */
      A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
      A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
      A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
      A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
      A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
      A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
      A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
      A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
      A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
      A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
      /* SeqAIJ-level array accessors provided by this type */
      aij->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
      aij->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
      aij->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
      aij->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
      aij->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
      aij->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
      aij->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    } else {
      /* the copy from the GPU is done first, before any operation is switched */
      PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

      /* host implementations */
      A->ops->scale                     = MatScale_SeqAIJ;
      A->ops->axpy                      = MatAXPY_SeqAIJ;
      A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
      A->ops->mult                      = MatMult_SeqAIJ;
      A->ops->multadd                   = MatMultAdd_SeqAIJ;
      A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
      A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
      A->ops->multhermitiantranspose    = NULL;
      A->ops->multhermitiantransposeadd = NULL;
      A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
      /* clear every SeqAIJ-level accessor override */
      PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));

      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
    }
    /* inodes are only used while bound to the CPU, and only if inode data is present */
    aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4020 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - turn a SeqAIJ matrix into a MATSEQAIJCUSPARSE one

  reuse selects where the result lives: a fresh duplicate (MAT_INITIAL_MATRIX), a copy into the
  existing *newmat (MAT_REUSE_MATRIX), or *newmat itself as passed in (any other value).
  The result gets VECCUDA as default vector type, a spptr with its own cuSPARSE handle when
  it has none yet, and the CUSPARSE operation table.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat C;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  C = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(C->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &C->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !C->spptr) {
    if (C->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      C->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      C->spptr = cusp;
    }
    C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* type-specific operations that do not depend on the CPU/GPU binding */
  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  C->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* the remaining operations are installed by binding to the GPU */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(C, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)C, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4080 
/* Constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* host part first */
  PetscCall(MatCreate_SeqAIJ(B));
  /* in-place conversion: B is both the input and the output matrix */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4088 
4089 /*MC
4090    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4091 
4092    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4093    CSR, ELL, or Hybrid format.
4094    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4095 
4096    Options Database Keys:
4097 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4098 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4099                                       Other options include ell (ellpack) or hyb (hybrid).
4100 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4101 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4102 
4103   Level: beginner
4104 
4105 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4106 M*/
4107 
/* Register MatGetFactor_seqaijcusparse_cusparse() as the MATSOLVERCUSPARSE factory for LU, Cholesky, ILU and ICC */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factorizations offered for MATSEQAIJCUSPARSE, in registration order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};
  const size_t        nftypes  = sizeof(ftypes) / sizeof(ftypes[0]);

  PetscFunctionBegin;
  for (size_t k = 0; k < nftypes; k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4117 
/* Free everything owned by the Mat_SeqAIJCUSPARSE stored in mat->spptr, then the struct itself; a NULL spptr is a no-op */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!gpu) PetscFunctionReturn(PETSC_SUCCESS);

  /* multiplication structures of the matrix and of its transpose */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->mat, gpu->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
  /* auxiliary arrays allocated with new (deleting a null pointer is harmless) */
  delete gpu->workVector;
  delete gpu->rowoffsets_gpu;
  delete gpu->csr2csc_i;
  delete gpu->coords;
  if (gpu->handle) PetscCallCUSPARSE(cusparseDestroy(gpu->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4135 
/* Delete the three arrays of a CsrMatrix and the struct itself, then reset the caller's pointer; *mat may be NULL */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4148 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Overload for a triangular factor (CUDA older than 11.4): release its descriptor, solve info, CSR data and buffers, then the struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(PETSC_SUCCESS);
  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  /* AA_h is released with cudaFreeHost(), unlike the device buffers around it */
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4167 
/*
  Overload for a multiplication structure: release the stored matrix (CSR, or HYB/ELL on CUDA older than 11),
  the legacy descriptor, the compressed-row indices, the device scalars and, on CUDA >= 11, the generic-API
  descriptors and SpMV buffers. On return *matstruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms = *matstruct;

  PetscFunctionBegin;
  if (!ms) PetscFunctionReturn(PETSC_SUCCESS);

  if (ms->mat) {
    if (format != MAT_CUSPARSE_ELL && format != MAT_CUSPARSE_HYB) {
      CsrMatrix *csr = (CsrMatrix *)ms->mat;

      PetscCall(CsrMatrix_Destroy(&csr));
    } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      cusparseHybMat_t hyb = (cusparseHybMat_t)ms->mat;

      PetscCallCUSPARSE(cusparseDestroyHybMat(hyb));
#endif
    }
  }
  if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
  delete ms->cprowIndices;
  /* scalar constants, released with cudaFree() */
  if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
  if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
  if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));

  /* three SpMV slots; only the initialized ones own a buffer and descriptors */
  for (int k = 0; k < 3; k++) {
    if (!ms->cuSpMV[k].initialized) continue;
    PetscCallCUDA(cudaFree(ms->cuSpMV[k].spmvBuffer));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[k].vecXDescr));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[k].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
    if (ms->matDescr_SpMV[k]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMV[k]));
    if (ms->matDescr_SpMM[k]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMM[k]));
  #endif
  }
#endif
  delete ms;
  *matstruct = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4214 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - release all factorization data held by a Mat_SeqAIJCUSPARSETriFactors

  The struct itself and its cuSPARSE handle are kept (see MatSeqAIJCUSPARSETriFactors_Destroy()), so the
  object can be refilled afterwards. Every released field is set to NULL right after its release:
  a later Reset, e.g. the one done by MatSeqAIJCUSPARSETriFactors_Destroy(), must not free device memory
  or destroy descriptors a second time when only part of the fields have been recreated in between.

  Input Parameter:
. trifactors - address of the pointer to the factor data; a NULL pointee is a no-op
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* these destroy routines end with PetscFree(), which also resets the pointers */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    /* factBuffer_M is not freed on its own since it shares storage with one of spsvBuffer_L/U; it is only reset */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    fs->factBuffer_M = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;
    /* cuSPARSE descriptors and analysis info */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;
    /* host mirrors; PetscFree() resets the pointers itself */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4268 
/* Reset the factor data, destroy its cuSPARSE handle and free the struct; a NULL *trifactors is a no-op */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  /* contents first, then the handle the contents were created with */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4279 
/* Strict lexicographic "less than" on (first, second) index pairs, callable on host and device */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt first1 = thrust::get<0>(t1), first2 = thrust::get<0>(t2);

    /* order by the first component; the second component only breaks ties */
    return first1 < first2 || (first1 == first2 && thrust::get<1>(t1) < thrust::get<1>(t2));
  }
};
4288 
/*
  Mark the cached transpose of A as out of date. With destroy = PETSC_TRUE the transpose
  structure and the csr2csc index array are released as well. Nothing happens when A has no spptr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (gpu) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
      delete gpu->csr2csc_i;
      gpu->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4304 
/* Container destructor for the device COO struct: its perm and jmap arrays are released with cudaFree(), the struct with PetscFree() */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *coo_d = static_cast<MatCOOStruct_SeqAIJ *>(data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo_d->perm));
  PetscCallCUDA(cudaFree(coo_d->jmap));
  PetscCall(PetscFree(coo_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4315 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE

  The symbolic work is delegated to MatSetPreallocationCOO_SeqAIJ() on the host (device index arrays
  are copied to the host first). Afterwards the matrix is pushed to the GPU and a device copy of the
  host COO struct, with its jmap and perm arrays in device memory, is attached to the matrix.
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         mtype = PETSC_MEMTYPE_HOST;
  PetscBool            staged;
  PetscInt            *rows, *cols;
  PetscContainer       host_container;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;
  size_t               jmap_bytes, perm_bytes;

  PetscFunctionBegin;
  /* only coo_i is inspected; coo_j is handled the same way */
  PetscCall(PetscGetMemType(coo_i, &mtype));
  staged = PetscMemTypeDevice(mtype) ? PETSC_TRUE : PETSC_FALSE;
  if (staged) {
    /* indices given in device memory: bring them to temporary host arrays */
    PetscCall(PetscMalloc2(coo_n, &rows, coo_n, &cols));
    PetscCallCUDA(cudaMemcpy(rows, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(cols, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    rows = coo_i;
    cols = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, rows, cols));
  if (staged) PetscCall(PetscFree2(rows, cols));

  /* the host holds the valid data now; create the GPU memory from it */
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  /* fetch the host COO struct composed with the matrix */
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&host_container));
  PetscCall(PetscContainerGetPointer(host_container, (void **)&coo_h));

  /* shallow copy, then replace the two arrays by device copies */
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d     = *coo_h;
  jmap_bytes = (coo_h->nz + 1) * sizeof(PetscCount);
  perm_bytes = coo_h->Atot * sizeof(PetscCount);
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, jmap_bytes));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, jmap_bytes, cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, perm_bytes));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, perm_bytes, cudaMemcpyHostToDevice));

  /* attach the device struct to the matrix inside a container with its own destructor */
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4357 
/*
  MatAddCOOValues - kernel that accumulates COO input values into the stored values of a matrix

  For every i in [0, nnz) the values kv[perm[k]], k in [jmap[i], jmap[i+1]), are summed; the sum
  replaces a[i] when imode is INSERT_VALUES and is added to a[i] otherwise.

  Launch: 1D grid of 1D blocks of any size; the grid-stride loop covers all nnz entries.
  The thread index and the stride are computed in PetscCount so that the products cannot wrap
  around in 32-bit arithmetic, whatever the launch configuration.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  /* widen before multiplying: blockIdx, blockDim and gridDim components are 32-bit unsigned */
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4368 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - set or add the values of a matrix preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE()

  Input Parameters:
+ A     - the matrix
. v     - the COO values, in host or device memory; host values are copied to a temporary device buffer
- imode - INSERT_VALUES overwrites the stored values, any other mode adds to them

  The accumulation is done on the GPU by the MatAddCOOValues kernel using the device COO struct
  attached to the matrix.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* with INSERT_VALUES the old device values are not needed, so write-only access suffices */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /*
      Ceil-division done in PetscCount and only then narrowed: casting Annz + 255 to int first gives a
      negative, invalid grid size once Annz + 255 exceeds INT_MAX. The number of blocks is capped so that
      grid size times block size stays below 2^31; the grid-stride loop of the kernel handles the rest.
    */
    const PetscCount max_blocks = 2147483647 / 256;
    PetscCount       nblocks    = (Annz + 255) / 256;

    if (nblocks > max_blocks) nblocks = max_blocks;
    MatAddCOOValues<<<(int)nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  /* release the temporary device copy of host values */
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4408 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, pass `NULL` if not needed
- j - the CSR column indices, pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  Nothing is done when both `i` and `j` are `NULL`

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* return early only when nothing is requested; a single NULL output must not suppress the other one */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* built once from the host row pointers and cached in the matrix */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        /* the device array holds 32-bit ints (it is handed out as const int *), so log that size */
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(int)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4456 
4457 /*@C
4458   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4459 
4460   Not Collective
4461 
4462   Input Parameters:
4463 + A          - the matrix
4464 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4465 . i          - the CSR row pointers
4466 - j          - the CSR column indices
4467 
4468   Level: developer
4469 
4470 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4471 @*/
4472 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4473 {
4474   PetscFunctionBegin;
4475   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4476   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4477   if (i) *i = NULL;
4478   if (j) *j = NULL;
4479   (void)compressed;
4480   PetscFunctionReturn(PETSC_SUCCESS);
4481 }
4482 
4483 /*@C
4484   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4485 
4486   Not Collective
4487 
4488   Input Parameter:
4489 . A - a `MATSEQAIJCUSPARSE` matrix
4490 
4491   Output Parameter:
4492 . a - pointer to the device data
4493 
4494   Level: developer
4495 
4496   Note:
4497   May trigger host-device copies if up-to-date matrix data is on host
4498 
4499 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4500 @*/
4501 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4502 {
4503   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4504   CsrMatrix          *csr;
4505 
4506   PetscFunctionBegin;
4507   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4508   PetscAssertPointer(a, 2);
4509   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4510   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4511   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4512   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4513   csr = (CsrMatrix *)cusp->mat->mat;
4514   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4515   *a = csr->values->data().get();
4516   PetscFunctionReturn(PETSC_SUCCESS);
4517 }
4518 
4519 /*@C
4520   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4521 
4522   Not Collective
4523 
4524   Input Parameters:
4525 + A - a `MATSEQAIJCUSPARSE` matrix
4526 - a - pointer to the device data
4527 
4528   Level: developer
4529 
4530 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4531 @*/
4532 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4533 {
4534   PetscFunctionBegin;
4535   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4536   PetscAssertPointer(a, 2);
4537   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4538   *a = NULL;
4539   PetscFunctionReturn(PETSC_SUCCESS);
4540 }
4541 
4542 /*@C
4543   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4544 
4545   Not Collective
4546 
4547   Input Parameter:
4548 . A - a `MATSEQAIJCUSPARSE` matrix
4549 
4550   Output Parameter:
4551 . a - pointer to the device data
4552 
4553   Level: developer
4554 
4555   Note:
4556   May trigger host-device copies if up-to-date matrix data is on host
4557 
4558 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4559 @*/
4560 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4561 {
4562   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4563   CsrMatrix          *csr;
4564 
4565   PetscFunctionBegin;
4566   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4567   PetscAssertPointer(a, 2);
4568   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4569   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4570   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4571   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4572   csr = (CsrMatrix *)cusp->mat->mat;
4573   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4574   *a             = csr->values->data().get();
4575   A->offloadmask = PETSC_OFFLOAD_GPU;
4576   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4577   PetscFunctionReturn(PETSC_SUCCESS);
4578 }
4579 /*@C
4580   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4581 
4582   Not Collective
4583 
4584   Input Parameters:
4585 + A - a `MATSEQAIJCUSPARSE` matrix
4586 - a - pointer to the device data
4587 
4588   Level: developer
4589 
4590 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4591 @*/
4592 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4593 {
4594   PetscFunctionBegin;
4595   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4596   PetscAssertPointer(a, 2);
4597   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4598   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4599   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4600   *a = NULL;
4601   PetscFunctionReturn(PETSC_SUCCESS);
4602 }
4603 
4604 /*@C
4605   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4606 
4607   Not Collective
4608 
4609   Input Parameter:
4610 . A - a `MATSEQAIJCUSPARSE` matrix
4611 
4612   Output Parameter:
4613 . a - pointer to the device data
4614 
4615   Level: developer
4616 
4617   Note:
4618   Does not trigger host-device copies and flags data validity on the GPU
4619 
4620 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4621 @*/
4622 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4623 {
4624   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4625   CsrMatrix          *csr;
4626 
4627   PetscFunctionBegin;
4628   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4629   PetscAssertPointer(a, 2);
4630   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4631   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4632   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4633   csr = (CsrMatrix *)cusp->mat->mat;
4634   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4635   *a             = csr->values->data().get();
4636   A->offloadmask = PETSC_OFFLOAD_GPU;
4637   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4638   PetscFunctionReturn(PETSC_SUCCESS);
4639 }
4640 
4641 /*@C
4642   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4643 
4644   Not Collective
4645 
4646   Input Parameters:
4647 + A - a `MATSEQAIJCUSPARSE` matrix
4648 - a - pointer to the device data
4649 
4650   Level: developer
4651 
4652 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4653 @*/
4654 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4655 {
4656   PetscFunctionBegin;
4657   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4658   PetscAssertPointer(a, 2);
4659   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4660   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4661   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4662   *a = NULL;
4663   PetscFunctionReturn(PETSC_SUCCESS);
4664 }
4665 
4666 struct IJCompare4 {
4667   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4668   {
4669     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4670     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4671     return false;
4672   }
4673 };
4674 
4675 struct Shift {
4676   int _shift;
4677 
4678   Shift(int shift) : _shift(shift) { }
4679   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4680 };
4681 
4682 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4683 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4684 {
4685   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4686   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4687   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4688   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4689   PetscInt                      Annz, Bnnz;
4690   cusparseStatus_t              stat;
4691   PetscInt                      i, m, n, zero = 0;
4692 
4693   PetscFunctionBegin;
4694   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4695   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4696   PetscAssertPointer(C, 4);
4697   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4698   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4699   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4700   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4701   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4702   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4703   if (reuse == MAT_INITIAL_MATRIX) {
4704     m = A->rmap->n;
4705     n = A->cmap->n + B->cmap->n;
4706     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4707     PetscCall(MatSetSizes(*C, m, n, m, n));
4708     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4709     c                       = (Mat_SeqAIJ *)(*C)->data;
4710     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4711     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4712     Ccsr                    = new CsrMatrix;
4713     Cmat->cprowIndices      = NULL;
4714     c->compressedrow.use    = PETSC_FALSE;
4715     c->compressedrow.nrows  = 0;
4716     c->compressedrow.i      = NULL;
4717     c->compressedrow.rindex = NULL;
4718     Ccusp->workVector       = NULL;
4719     Ccusp->nrows            = m;
4720     Ccusp->mat              = Cmat;
4721     Ccusp->mat->mat         = Ccsr;
4722     Ccsr->num_rows          = m;
4723     Ccsr->num_cols          = n;
4724     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4725     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4726     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4727     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4728     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4729     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4730     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4731     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4732     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4733     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4734     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4735     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4736     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4737 
4738     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4739     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4740     Annz                 = (PetscInt)Acsr->column_indices->size();
4741     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4742     c->nz                = Annz + Bnnz;
4743     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4744     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4745     Ccsr->values         = new THRUSTARRAY(c->nz);
4746     Ccsr->num_entries    = c->nz;
4747     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4748     if (c->nz) {
4749       auto              Acoo = new THRUSTINTARRAY32(Annz);
4750       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4751       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4752       THRUSTINTARRAY32 *Aroff, *Broff;
4753 
4754       if (a->compressedrow.use) { /* need full row offset */
4755         if (!Acusp->rowoffsets_gpu) {
4756           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4757           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4758           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4759         }
4760         Aroff = Acusp->rowoffsets_gpu;
4761       } else Aroff = Acsr->row_offsets;
4762       if (b->compressedrow.use) { /* need full row offset */
4763         if (!Bcusp->rowoffsets_gpu) {
4764           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4765           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4766           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4767         }
4768         Broff = Bcusp->rowoffsets_gpu;
4769       } else Broff = Bcsr->row_offsets;
4770       PetscCall(PetscLogGpuTimeBegin());
4771       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4772       PetscCallCUSPARSE(stat);
4773       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4774       PetscCallCUSPARSE(stat);
4775       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4776       auto Aperm = thrust::make_constant_iterator(1);
4777       auto Bperm = thrust::make_constant_iterator(0);
4778 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4779       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4780       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4781 #else
4782       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4783       auto Bcib = Bcsr->column_indices->begin();
4784       auto Bcie = Bcsr->column_indices->end();
4785       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4786 #endif
4787       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4788       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4789       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4790       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4791       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4792       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4793       auto p1    = Ccusp->coords->begin();
4794       auto p2    = Ccusp->coords->begin();
4795       thrust::advance(p2, Annz);
4796       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4797 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4798       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4799 #endif
4800       auto cci = thrust::make_counting_iterator(zero);
4801       auto cce = thrust::make_counting_iterator(c->nz);
4802 #if 0 //Errors on SUMMIT cuda 11.1.0
4803       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4804 #else
4805       auto pred = thrust::identity<int>();
4806       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4807       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4808 #endif
4809       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4810       PetscCallCUSPARSE(stat);
4811       PetscCall(PetscLogGpuTimeEnd());
4812       delete wPerm;
4813       delete Acoo;
4814       delete Bcoo;
4815       delete Ccoo;
4816 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4817       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4818       PetscCallCUSPARSE(stat);
4819 #endif
4820       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4821         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4822         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4823         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4824         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4825         CsrMatrix                    *CcsrT = new CsrMatrix;
4826         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4827         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4828 
4829         (*C)->form_explicit_transpose = PETSC_TRUE;
4830         (*C)->transupdated            = PETSC_TRUE;
4831         Ccusp->rowoffsets_gpu         = NULL;
4832         CmatT->cprowIndices           = NULL;
4833         CmatT->mat                    = CcsrT;
4834         CcsrT->num_rows               = n;
4835         CcsrT->num_cols               = m;
4836         CcsrT->num_entries            = c->nz;
4837 
4838         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4839         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4840         CcsrT->values         = new THRUSTARRAY(c->nz);
4841 
4842         PetscCall(PetscLogGpuTimeBegin());
4843         auto rT = CcsrT->row_offsets->begin();
4844         if (AT) {
4845           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4846           thrust::advance(rT, -1);
4847         }
4848         if (BT) {
4849           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4850           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4851           thrust::copy(titb, tite, rT);
4852         }
4853         auto cT = CcsrT->column_indices->begin();
4854         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4855         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4856         auto vT = CcsrT->values->begin();
4857         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4858         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4859         PetscCall(PetscLogGpuTimeEnd());
4860 
4861         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4862         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4863         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4864         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4865         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4866         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4867         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4868         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4869         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4870 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4871         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4872         PetscCallCUSPARSE(stat);
4873 #endif
4874         Ccusp->matTranspose = CmatT;
4875       }
4876     }
4877 
4878     c->free_a = PETSC_TRUE;
4879     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4880     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4881     c->free_ij = PETSC_TRUE;
4882     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4883       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4884       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4885       ii = *Ccsr->row_offsets;
4886       jj = *Ccsr->column_indices;
4887       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4888       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4889     } else {
4890       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4891       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4892     }
4893     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4894     PetscCall(PetscMalloc1(m, &c->ilen));
4895     PetscCall(PetscMalloc1(m, &c->imax));
4896     c->maxnz         = c->nz;
4897     c->nonzerorowcnt = 0;
4898     c->rmax          = 0;
4899     for (i = 0; i < m; i++) {
4900       const PetscInt nn = c->i[i + 1] - c->i[i];
4901       c->ilen[i] = c->imax[i] = nn;
4902       c->nonzerorowcnt += (PetscInt)!!nn;
4903       c->rmax = PetscMax(c->rmax, nn);
4904     }
4905     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4906     PetscCall(PetscMalloc1(c->nz, &c->a));
4907     (*C)->nonzerostate++;
4908     PetscCall(PetscLayoutSetUp((*C)->rmap));
4909     PetscCall(PetscLayoutSetUp((*C)->cmap));
4910     Ccusp->nonzerostate = (*C)->nonzerostate;
4911     (*C)->preallocated  = PETSC_TRUE;
4912   } else {
4913     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4914     c = (Mat_SeqAIJ *)(*C)->data;
4915     if (c->nz) {
4916       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4917       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4918       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4919       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4920       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4921       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4922       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4923       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4924       Acsr = (CsrMatrix *)Acusp->mat->mat;
4925       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4926       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4927       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4928       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4929       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4930       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4931       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4932       auto pmid = Ccusp->coords->begin();
4933       thrust::advance(pmid, Acsr->num_entries);
4934       PetscCall(PetscLogGpuTimeBegin());
4935       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4936       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4937       thrust::for_each(zibait, zieait, VecCUDAEquals());
4938       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4939       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4940       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4941       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4942       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4943         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4944         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4945         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4946         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4947         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4948         auto       vT    = CcsrT->values->begin();
4949         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4950         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4951         (*C)->transupdated = PETSC_TRUE;
4952       }
4953       PetscCall(PetscLogGpuTimeEnd());
4954     }
4955   }
4956   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4957   (*C)->assembled     = PETSC_TRUE;
4958   (*C)->was_assembled = PETSC_FALSE;
4959   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4960   PetscFunctionReturn(PETSC_SUCCESS);
4961 }
4962 
4963 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4964 {
4965   bool               dmem;
4966   const PetscScalar *av;
4967 
4968   PetscFunctionBegin;
4969   dmem = isCudaMem(v);
4970   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4971   if (n && idx) {
4972     THRUSTINTARRAY widx(n);
4973     widx.assign(idx, idx + n);
4974     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4975 
4976     THRUSTARRAY                    *w = NULL;
4977     thrust::device_ptr<PetscScalar> dv;
4978     if (dmem) {
4979       dv = thrust::device_pointer_cast(v);
4980     } else {
4981       w  = new THRUSTARRAY(n);
4982       dv = w->data();
4983     }
4984     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4985 
4986     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4987     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4988     thrust::for_each(zibit, zieit, VecCUDAEquals());
4989     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4990     delete w;
4991   } else {
4992     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4993   }
4994   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4995   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4996   PetscFunctionReturn(PETSC_SUCCESS);
4997 }
4998 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
4999