xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision fc905784864101ddc2607c2316f27d6aeb8add2a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
/* String table for MatCUSPARSEStorageFormat; handed to PetscOptionsEnum() when parsing the storage format options */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",                      /* value 0 */
  "ELL",                      /* value 1 */
  "HYB",                      /* value 2 */
  "MatCUSPARSEStorageFormat", /* enum type name */
  "MAT_CUSPARSE_",            /* common prefix of the enum values */
  nullptr                     /* terminator */
};
26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29 
30   typedef enum {
31       CUSPARSE_MV_ALG_DEFAULT = 0,
32       CUSPARSE_COOMV_ALG      = 1,
33       CUSPARSE_CSRMV_ALG1     = 2,
34       CUSPARSE_CSRMV_ALG2     = 3
35   } cusparseSpMVAlg_t;
36 
37   typedef enum {
38       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44       CUSPARSE_SPMM_COO_ALG1    = 1,
45       CUSPARSE_SPMM_COO_ALG2    = 2,
46       CUSPARSE_SPMM_COO_ALG3    = 3,
47       CUSPARSE_SPMM_COO_ALG4    = 5,
48       CUSPARSE_SPMM_CSR_ALG1    = 4,
49       CUSPARSE_SPMM_CSR_ALG2    = 6,
50   } cusparseSpMMAlg_t;
51 
52   typedef enum {
53       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55   } cusparseCsr2CscAlg_t;
56   */
/* Each table lists the value names at the index equal to their cuSPARSE enum value, followed by the enum type name, the common prefix and a null terminator */
const char *const MatCUSPARSESpMVAlgorithms[] = {
  "MV_ALG_DEFAULT", /* 0 */
  "COOMV_ALG",      /* 1 */
  "CSRMV_ALG1",     /* 2 */
  "CSRMV_ALG2",     /* 3 */
  "cusparseSpMVAlg_t", "CUSPARSE_", nullptr};
const char *const MatCUSPARSESpMMAlgorithms[] = {
  "ALG_DEFAULT", /* 0 */
  "COO_ALG1",    /* 1 */
  "COO_ALG2",    /* 2 */
  "COO_ALG3",    /* 3 */
  "CSR_ALG1",    /* 4 */
  "COO_ALG4",    /* 5 */
  "CSR_ALG2",    /* 6 */
  "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", nullptr};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {
  "INVALID", /* 0: cusparse does not have enum 0! We created one */
  "ALG1",    /* 1 */
  "ALG2",    /* 2 */
  "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", nullptr};
60 #endif
61 
62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
66 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
68 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
72 #endif
73 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
74 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
75 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
76 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
77 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
83 
84 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
85 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
86 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
88 
89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
91 
92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
93 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
94 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95 
96 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
97 {
98   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
99 
100   PetscFunctionBegin;
101   switch (op) {
102   case MAT_CUSPARSE_MULT:
103     cusparsestruct->format = format;
104     break;
105   case MAT_CUSPARSE_ALL:
106     cusparsestruct->format = format;
107     break;
108   default:
109     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
110   }
111   PetscFunctionReturn(PETSC_SUCCESS);
112 }
113 
114 /*@
115    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
116    operation. Only the `MatMult()` operation can use different GPU storage formats
117 
118    Not Collective
119 
120    Input Parameters:
121 +  A - Matrix of type `MATSEQAIJCUSPARSE`
122 .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
123         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
124 -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
125 
126    Level: intermediate
127 
128 .seealso: [](ch_matrices), `Mat`, `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129 @*/
130 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
131 {
132   PetscFunctionBegin;
133   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
134   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
135   PetscFunctionReturn(PETSC_SUCCESS);
136 }
137 
138 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
139 {
140   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
141 
142   PetscFunctionBegin;
143   cusparsestruct->use_cpu_solve = use_cpu;
144   PetscFunctionReturn(PETSC_SUCCESS);
145 }
146 
147 /*@
148    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
149 
150    Input Parameters:
151 +  A - Matrix of type `MATSEQAIJCUSPARSE`
152 -  use_cpu - set flag for using the built-in CPU `MatSolve()`
153 
154    Level: intermediate
155 
156    Note:
157    The cuSparse LU solver currently computes the factors with the built-in CPU method
158    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
159    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
160 
161 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
162 @*/
163 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
164 {
165   PetscFunctionBegin;
166   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
167   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
168   PetscFunctionReturn(PETSC_SUCCESS);
169 }
170 
171 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
172 {
173   PetscFunctionBegin;
174   switch (op) {
175   case MAT_FORM_EXPLICIT_TRANSPOSE:
176     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
177     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
178     A->form_explicit_transpose = flg;
179     break;
180   default:
181     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
182     break;
183   }
184   PetscFunctionReturn(PETSC_SUCCESS);
185 }
186 
187 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
188 {
189   MatCUSPARSEStorageFormat format;
190   PetscBool                flg;
191   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
192 
193   PetscFunctionBegin;
194   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
195   if (A->factortype == MAT_FACTOR_NONE) {
196     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
197     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
198 
199     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
201     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
202     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
204     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
205     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
206   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
207     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
208   #else
209     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
210   #endif
211     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
212     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
213 
214     PetscCall(
215       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
216     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
217 #endif
218   }
219   PetscOptionsHeadEnd();
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
224 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
225 {
226   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
227   PetscInt                      m  = A->rmap->n;
228   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
229   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
230   const MatScalar              *Aa = a->a;
231   PetscInt                     *Mi, *Mj, Mnz;
232   PetscScalar                  *Ma;
233 
234   PetscFunctionBegin;
235   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
236     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
237       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
238       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
239       PetscCall(PetscMalloc1(m + 1, &Mi));
240       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
241       PetscCall(PetscMalloc1(Mnz, &Ma));
242       Mi[0] = 0;
243       for (PetscInt i = 0; i < m; i++) {
244         PetscInt llen = Ai[i + 1] - Ai[i];
245         PetscInt ulen = Adiag[i] - Adiag[i + 1];
246         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
247         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
248         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
249         Mi[i + 1] = Mi[i] + llen + ulen;
250       }
251       // Copy M (L,U) from host to device
252       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
253       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
254       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
255       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
256       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));
257 
258       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
259       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
260       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
261       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
262       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
263       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
264       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
265       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
266 
267       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
268       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
269       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
270 
271       fillMode = CUSPARSE_FILL_MODE_UPPER;
272       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
273       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
274       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
275       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
276 
277       // Allocate work vectors in SpSv
278       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
279       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));
280 
281       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
282       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
283 
284       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
285       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
286       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
287       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
288       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
289       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
290       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
291 
292       // Record for reuse
293       fs->csrRowPtr_h = Mi;
294       fs->csrVal_h    = Ma;
295       PetscCall(PetscFree(Mj));
296     }
297     // Copy the value
298     Mi  = fs->csrRowPtr_h;
299     Ma  = fs->csrVal_h;
300     Mnz = Mi[m];
301     for (PetscInt i = 0; i < m; i++) {
302       PetscInt llen = Ai[i + 1] - Ai[i];
303       PetscInt ulen = Adiag[i] - Adiag[i + 1];
304       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
305       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
306       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
307     }
308     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
309 
310     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
311     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
312 
313     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
314 
315     // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
316     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
317   }
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 #else
321 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
322 {
323   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
324   PetscInt                           n                  = A->rmap->n;
325   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
326   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
327   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
328   const MatScalar                   *aa = a->a, *v;
329   PetscInt                          *AiLo, *AjLo;
330   PetscInt                           i, nz, nzLower, offset, rowOffset;
331 
332   PetscFunctionBegin;
333   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
334   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
335     try {
336       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
337       nzLower = n + ai[n] - ai[1];
338       if (!loTriFactor) {
339         PetscScalar *AALo;
340 
341         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
342 
343         /* Allocate Space for the lower triangular matrix */
344         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
345         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
346 
347         /* Fill the lower triangular matrix */
348         AiLo[0]   = (PetscInt)0;
349         AiLo[n]   = nzLower;
350         AjLo[0]   = (PetscInt)0;
351         AALo[0]   = (MatScalar)1.0;
352         v         = aa;
353         vi        = aj;
354         offset    = 1;
355         rowOffset = 1;
356         for (i = 1; i < n; i++) {
357           nz = ai[i + 1] - ai[i];
358           /* additional 1 for the term on the diagonal */
359           AiLo[i] = rowOffset;
360           rowOffset += nz + 1;
361 
362           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
363           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
364 
365           offset += nz;
366           AjLo[offset] = (PetscInt)i;
367           AALo[offset] = (MatScalar)1.0;
368           offset += 1;
369 
370           v += nz;
371           vi += nz;
372         }
373 
374         /* allocate space for the triangular factor information */
375         PetscCall(PetscNew(&loTriFactor));
376         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
377         /* Create the matrix description */
378         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
379         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
380   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
381         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
382   #else
383         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
384   #endif
385         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
386         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
387 
388         /* set the operation */
389         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
390 
391         /* set the matrix */
392         loTriFactor->csrMat              = new CsrMatrix;
393         loTriFactor->csrMat->num_rows    = n;
394         loTriFactor->csrMat->num_cols    = n;
395         loTriFactor->csrMat->num_entries = nzLower;
396 
397         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
398         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
399 
400         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
401         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
402 
403         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
404         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
405 
406         /* Create the solve analysis information */
407         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
408         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
409   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
410         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
411                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
412         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
413   #endif
414 
415         /* perform the solve analysis */
416         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
417                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
418         PetscCallCUDA(WaitForCUDA());
419         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
420 
421         /* assign the pointer */
422         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
423         loTriFactor->AA_h                                          = AALo;
424         PetscCallCUDA(cudaFreeHost(AiLo));
425         PetscCallCUDA(cudaFreeHost(AjLo));
426         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
427       } else { /* update values only */
428         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
429         /* Fill the lower triangular matrix */
430         loTriFactor->AA_h[0] = 1.0;
431         v                    = aa;
432         vi                   = aj;
433         offset               = 1;
434         for (i = 1; i < n; i++) {
435           nz = ai[i + 1] - ai[i];
436           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
437           offset += nz;
438           loTriFactor->AA_h[offset] = 1.0;
439           offset += 1;
440           v += nz;
441         }
442         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
443         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
444       }
445     } catch (char *ex) {
446       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
447     }
448   }
449   PetscFunctionReturn(PETSC_SUCCESS);
450 }
451 
452 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
453 {
454   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
455   PetscInt                           n                  = A->rmap->n;
456   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
457   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
458   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
459   const MatScalar                   *aa = a->a, *v;
460   PetscInt                          *AiUp, *AjUp;
461   PetscInt                           i, nz, nzUpper, offset;
462 
463   PetscFunctionBegin;
464   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
465   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
466     try {
467       /* next, figure out the number of nonzeros in the upper triangular matrix. */
468       nzUpper = adiag[0] - adiag[n];
469       if (!upTriFactor) {
470         PetscScalar *AAUp;
471 
472         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
473 
474         /* Allocate Space for the upper triangular matrix */
475         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
476         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
477 
478         /* Fill the upper triangular matrix */
479         AiUp[0] = (PetscInt)0;
480         AiUp[n] = nzUpper;
481         offset  = nzUpper;
482         for (i = n - 1; i >= 0; i--) {
483           v  = aa + adiag[i + 1] + 1;
484           vi = aj + adiag[i + 1] + 1;
485 
486           /* number of elements NOT on the diagonal */
487           nz = adiag[i] - adiag[i + 1] - 1;
488 
489           /* decrement the offset */
490           offset -= (nz + 1);
491 
492           /* first, set the diagonal elements */
493           AjUp[offset] = (PetscInt)i;
494           AAUp[offset] = (MatScalar)1. / v[nz];
495           AiUp[i]      = AiUp[i + 1] - (nz + 1);
496 
497           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
498           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
499         }
500 
501         /* allocate space for the triangular factor information */
502         PetscCall(PetscNew(&upTriFactor));
503         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
504 
505         /* Create the matrix description */
506         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
507         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
508   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
509         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
510   #else
511         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
512   #endif
513         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
514         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
515 
516         /* set the operation */
517         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
518 
519         /* set the matrix */
520         upTriFactor->csrMat              = new CsrMatrix;
521         upTriFactor->csrMat->num_rows    = n;
522         upTriFactor->csrMat->num_cols    = n;
523         upTriFactor->csrMat->num_entries = nzUpper;
524 
525         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
526         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
527 
528         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
529         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
530 
531         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
532         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
533 
534         /* Create the solve analysis information */
535         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
537   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
538         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
539                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
540         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
541   #endif
542 
543         /* perform the solve analysis */
544         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
545                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
546 
547         PetscCallCUDA(WaitForCUDA());
548         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
549 
550         /* assign the pointer */
551         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
552         upTriFactor->AA_h                                          = AAUp;
553         PetscCallCUDA(cudaFreeHost(AiUp));
554         PetscCallCUDA(cudaFreeHost(AjUp));
555         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
556       } else {
557         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
558         /* Fill the upper triangular matrix */
559         offset = nzUpper;
560         for (i = n - 1; i >= 0; i--) {
561           v = aa + adiag[i + 1] + 1;
562 
563           /* number of elements NOT on the diagonal */
564           nz = adiag[i] - adiag[i + 1] - 1;
565 
566           /* decrement the offset */
567           offset -= (nz + 1);
568 
569           /* first, set the diagonal elements */
570           upTriFactor->AA_h[offset] = 1. / v[nz];
571           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
572         }
573         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
574         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
575       }
576     } catch (char *ex) {
577       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
578     }
579   }
580   PetscFunctionReturn(PETSC_SUCCESS);
581 }
582 #endif
583 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds the GPU representation of the ILU/LU factors held by A
  and uploads the row/column permutations that are not the identity.

  Input Parameter:
. A - the factored matrix; A->spptr must point to a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  With CUDA >= 11.4 the factors are assembled by MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(); otherwise the
  lower and upper factors are built separately and a work vector of length n is created if it is missing.
  On return A->offloadmask is PETSC_OFFLOAD_BOTH. Each permutation array is created only when it does not
  exist yet.
*/
584 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
585 {
586   Mat_SeqAIJ                   *aij = static_cast<Mat_SeqAIJ *>(A->data);
587   Mat_SeqAIJCUSPARSETriFactors *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
588   IS                            rowperm = aij->row, colperm = aij->icol;
589   PetscBool                     rowIsNatural, colIsNatural;
590   PetscInt                      nrows = A->rmap->n;
591 
592   PetscFunctionBegin;
593   PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
594 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
595   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
596 #else
597   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
598   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
599   if (!fs->workVector) fs->workVector = new THRUSTARRAY(nrows);
600 #endif
601 
602   fs->nnz = aij->nz;
603 
604   A->offloadmask = PETSC_OFFLOAD_BOTH; // host and device now both hold the current factors
605   /* row permutation: uploaded once, and only for a non-identity ordering */
606   PetscCall(ISIdentity(rowperm, &rowIsNatural));
607   if (!(rowIsNatural || fs->rpermIndices)) {
608     const PetscInt *ridx;
609 
610     PetscCall(ISGetIndices(rowperm, &ridx));
611     fs->rpermIndices = new THRUSTINTARRAY(nrows);
612     fs->rpermIndices->assign(ridx, ridx + nrows);
613     PetscCall(ISRestoreIndices(rowperm, &ridx));
614     PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
615   }
616 
617   /* column permutation: same treatment as the row permutation */
618   PetscCall(ISIdentity(colperm, &colIsNatural));
619   if (!(colIsNatural || fs->cpermIndices)) {
620     const PetscInt *cidx;
621 
622     PetscCall(ISGetIndices(colperm, &cidx));
623     fs->cpermIndices = new THRUSTINTARRAY(nrows);
624     fs->cpermIndices->assign(cidx, cidx + nrows);
625     PetscCall(ISRestoreIndices(colperm, &cidx));
626     PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
627   }
628   PetscFunctionReturn(PETSC_SUCCESS);
629 }
630 
631 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - Copies the host Cholesky/ICC factor U of A to the device
  and prepares cuSPARSE SpSV for solves with U and with its transpose.

  Input Parameter:
. A - the factored matrix; A->data is a Mat_SeqAIJ and A->spptr a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Nothing is done unless A->offloadmask == PETSC_OFFLOAD_CPU.

  On the first call (fs->csrRowPtr is NULL) the routine builds the column indices with the diagonal entry
  placed first in each row, allocates the device arrays (csrRowPtr, csrColIdx, csrVal, diag, X, Y), creates
  the CSR descriptor for U (upper fill mode, unit diagonal), the dense-vector descriptors, and the SpSV
  descriptors and buffers for both the non-transpose and the transpose solve. The host value buffers are
  kept in fs->csrVal_h and fs->diag_h for reuse.

  On every call the values are refreshed on the host (unit diagonal, negated off-diagonal entries, and the
  separately stored diagonal D), copied to the device, and cusparseSpSV_analysis() is rerun for both
  operations.

  NOTE(review): "Cheolesky" in the name looks like a misspelling of "Cholesky"; the name is left as is
  because it is what callers in this file use.
*/
632 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
633 {
634   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
635   PetscInt                      m  = A->rmap->n;
636   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
637   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
638   const MatScalar              *Aa = a->a;
639   PetscInt                     *Mj, Mnz;
640   PetscScalar                  *Ma, *D;
641 
642   PetscFunctionBegin;
643   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
644     if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
645       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
646       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
647       Mnz = Ai[m]; // Unz (with the unit diagonal)
648       PetscCall(PetscMalloc1(Mnz, &Ma));
649       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
650       PetscCall(PetscMalloc1(m, &D));    // the diagonal
651       for (PetscInt i = 0; i < m; i++) {
652         PetscInt ulen = Ai[i + 1] - Ai[i];
653         Mj[Ai[i]]     = i;                                              // diagonal entry
654         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
655       }
656       // Copy M (U) from host to device
      // Only the structure (row pointers, column indices) is copied here; csrVal and diag are filled further below on every call.
657       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
658       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
659       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
660       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
661       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
662       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
663 
664       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
665       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
666       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
667       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
668       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
669       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
670       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
671       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
672 
673       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
674       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
675       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
676 
677       // Allocate work vectors in SpSv
678       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
679       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));
680 
681       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
682       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
683 
684       // Query buffer sizes for SpSV and then allocate buffers
685       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
686       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
687       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
688 
689       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
690       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
691       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
692 
693       // Record for reuse
      // Ma and D stay allocated and are owned by fs from here on; only the temporary Mj is released.
694       fs->csrVal_h = Ma;
695       fs->diag_h   = D;
696       PetscCall(PetscFree(Mj));
697     }
698     // Copy the value
699     Ma  = fs->csrVal_h;
700     D   = fs->diag_h;
701     Mnz = Ai[m];
702     for (PetscInt i = 0; i < m; i++) {
703       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
704       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // Off-diagonal entries of row i are stored negated, shifted by one slot to make room for the diagonal.
705       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
706     }
707     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
708     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
709 
710     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
711     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
712     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
713   }
714   PetscFunctionReturn(PETSC_SUCCESS);
715 }
716 
717 // Solve Ut D U x = b
/*
  MatSolve_SeqAIJCUSPARSE_Cholesky - Solves with the Cholesky/ICC factors stored on the device.

  Input Parameters:
+ A - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors used for the solve
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The steps are: gather b through fs->rpermIndices into fs->X when a row permutation exists, solve
  Ut Y = X, scale Y entrywise by fs->diag, solve U X = Y, and gather the result through fs->cpermIndices
  into x when a column permutation exists. Without permutations the device arrays of b and x are bound
  directly to the dense-vector descriptor, so no intermediate copy is made.
  Logs 4*nz - n GPU flops, with nz taken from the Mat_SeqAIJ data and n = A->rmap->n.
*/
718 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
719 {
720   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
721   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
722   const PetscScalar                    *barray;
723   PetscScalar                          *xarray;
724   thrust::device_ptr<const PetscScalar> bGPU;
725   thrust::device_ptr<PetscScalar>       xGPU;
726   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
727   PetscInt                              m   = A->rmap->n;
728 
729   PetscFunctionBegin;
730   PetscCall(PetscLogGpuTimeBegin());
731   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
732   PetscCall(VecCUDAGetArrayRead(b, &barray));
733   xGPU = thrust::device_pointer_cast(xarray);
734   bGPU = thrust::device_pointer_cast(barray);
735 
736   // Reorder b with the row permutation if needed, and wrap the result in fs->X
737   if (fs->rpermIndices) {
738     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
739     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
740   } else {
    // No permutation: point the X descriptor straight at b's device array (the cast drops const).
741     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
742   }
743 
744   // Solve Ut Y = X
745   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
746   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
747 
748   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
749   // It is basically a vector element-wise multiplication, but cublas does not have it!
750   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
751 
752   // Solve U X = Y
753   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
754     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
755   } else {
756     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
757   }
  // Here Y is the right-hand side and X receives the solution, the reverse of the transpose solve above.
758   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
759 
760   // Reorder X with the column permutation if needed, and put the result back to x
761   if (fs->cpermIndices) {
762     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
763                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
764   }
765 
766   PetscCall(VecCUDARestoreArrayRead(b, &barray));
767   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
768   PetscCall(PetscLogGpuTimeEnd());
769   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
770   PetscFunctionReturn(PETSC_SUCCESS);
771 }
772 #else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds (or refreshes the values of) the two triangular factor
  structures used by the legacy csrsv-based ICC/Cholesky solve and copies them to the GPU.

  Input Parameter:
. A - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors that receives the factors

  Notes:
  Returns immediately when the local size n is zero, and does nothing unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Two pinned host value buffers (AAUp, AALo) are allocated on every call and released before returning.
  When neither factor structure exists yet, the routine also builds the row offsets and column indices,
  creates both factor structures with their matrix descriptors, uploads the CSR arrays and runs the
  csrsv analysis for each. Otherwise only the values are recomputed and re-uploaded.

  Both factors share the same upper-triangular sparsity pattern (AiUp/AjUp). The "upper" factor is
  solved with CUSPARSE_OPERATION_NON_TRANSPOSE and a unit diagonal; the "lower" factor is stored with
  CUSPARSE_FILL_MODE_UPPER, a non-unit diagonal, and solved with CUSPARSE_OPERATION_TRANSPOSE.

  NOTE(review): PetscCall presumably returns early on error; if so, a failure inside the try block
  would skip the cudaFreeHost() calls for the pinned buffers - confirm whether that matters here.
*/
773 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
774 {
775   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
776   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
777   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
778   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
779   PetscInt                          *AiUp, *AjUp;
780   PetscScalar                       *AAUp;
781   PetscScalar                       *AALo;
782   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
783   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
784   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
785   const MatScalar                   *aa = b->a, *v;
786 
787   PetscFunctionBegin;
788   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
789   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
790     try {
      /* pinned host staging buffers for the values of the two factors; freed at the end of the try block */
791       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
792       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
793       if (!upTriFactor && !loTriFactor) {
794         /* Allocate Space for the upper triangular matrix */
795         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
796         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
797 
798         /* Fill the upper triangular matrix */
799         AiUp[0] = (PetscInt)0;
800         AiUp[n] = nzUpper;
801         offset  = 0;
802         for (i = 0; i < n; i++) {
803           /* set the pointers */
804           v  = aa + ai[i];
805           vj = aj + ai[i];
806           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
807 
808           /* first, set the diagonal elements */
          /* v[nz], the last stored entry of the row, is used as the diagonal value and is inverted here */
809           AjUp[offset] = (PetscInt)i;
810           AAUp[offset] = (MatScalar)1.0 / v[nz];
811           AiUp[i]      = offset;
812           AALo[offset] = (MatScalar)1.0 / v[nz];
813 
814           offset += 1;
815           if (nz > 0) {
816             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
817             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonal entries: negated for the upper factor, negated and divided by v[nz] for the lower one */
818             for (j = offset; j < offset + nz; j++) {
819               AAUp[j] = -AAUp[j];
820               AALo[j] = AAUp[j] / v[nz];
821             }
822             offset += nz;
823           }
824         }
825 
826         /* allocate space for the triangular factor information */
827         PetscCall(PetscNew(&upTriFactor));
828         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
829 
830         /* Create the matrix description */
831         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
832         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
833   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
834         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
835   #else
836         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
837   #endif
838         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
839         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
840 
841         /* set the matrix */
842         upTriFactor->csrMat              = new CsrMatrix;
843         upTriFactor->csrMat->num_rows    = A->rmap->n;
844         upTriFactor->csrMat->num_cols    = A->cmap->n;
845         upTriFactor->csrMat->num_entries = a->nz;
846 
847         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
848         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
849 
850         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
852 
853         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
855 
856         /* set the operation */
857         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
858 
859         /* Create the solve analysis information */
860         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
861         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
862   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
863         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
864                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
865         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
866   #endif
867 
868         /* perform the solve analysis */
869         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
870                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
871 
872         PetscCallCUDA(WaitForCUDA());
873         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
874 
875         /* assign the pointer */
876         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
877 
878         /* allocate space for the triangular factor information */
879         PetscCall(PetscNew(&loTriFactor));
880         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
881 
882         /* Create the matrix description */
883         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
884         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
885   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
886         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
887   #else
888         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
889   #endif
        /* the lower factor keeps the upper-triangular storage; the transpose solve operation set below accounts for that */
890         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
891         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
892 
893         /* set the operation */
894         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
895 
896         /* set the matrix */
897         loTriFactor->csrMat              = new CsrMatrix;
898         loTriFactor->csrMat->num_rows    = A->rmap->n;
899         loTriFactor->csrMat->num_cols    = A->cmap->n;
900         loTriFactor->csrMat->num_entries = a->nz;
901 
902         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
903         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
904 
905         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
906         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
907 
908         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
909         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
910 
911         /* Create the solve analysis information */
912         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
913         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
914   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
915         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
916                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
917         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
918   #endif
919 
920         /* perform the solve analysis */
921         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
922                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
923 
924         PetscCallCUDA(WaitForCUDA());
925         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
926 
927         /* assign the pointer */
928         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
929 
930         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
931         PetscCallCUDA(cudaFreeHost(AiUp));
932         PetscCallCUDA(cudaFreeHost(AjUp));
933       } else {
        /* at least one factor structure already exists: recompute only the values; both structures are checked before use */
934         /* Fill the upper triangular matrix */
935         offset = 0;
936         for (i = 0; i < n; i++) {
937           /* set the pointers */
938           v  = aa + ai[i];
939           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
940 
941           /* first, set the diagonal elements */
942           AAUp[offset] = 1.0 / v[nz];
943           AALo[offset] = 1.0 / v[nz];
944 
945           offset += 1;
946           if (nz > 0) {
947             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
948             for (j = offset; j < offset + nz; j++) {
949               AAUp[j] = -AAUp[j];
950               AALo[j] = AAUp[j] / v[nz];
951             }
952             offset += nz;
953           }
954         }
955         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
956         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
957         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
958         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
959         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
960       }
961       PetscCallCUDA(cudaFreeHost(AAUp));
962       PetscCallCUDA(cudaFreeHost(AALo));
963     } catch (char *ex) {
964       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
965     }
966   }
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 #endif
970 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the GPU representation of the ICC/Cholesky factor held
  by A and, for a non-identity ordering, uploads the permutation and its inverse.

  Input Parameter:
. A - the factored matrix; A->spptr must point to a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  With CUDA >= 11.4 the factor is assembled by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(); otherwise
  MatSeqAIJCUSPARSEBuildICCTriMatrices() is used and a work vector of length n is created if missing.
  The stored nonzero count is (nz - n) * 2 + n. On return A->offloadmask is PETSC_OFFLOAD_BOTH.

  The permutation arrays are created only when they do not exist yet, matching
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(). This routine runs on every numeric factorization; allocating
  them unconditionally would overwrite the previous pointers and leak the old device arrays.
  rpermIndices receives the ordering itself and cpermIndices its inverse; both are always created
  together here, so testing rpermIndices alone is sufficient.
*/
971 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
972 {
973   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
974   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
975   IS                            ip                 = a->row;
976   PetscBool                     perm_identity;
977   PetscInt                      n = A->rmap->n;
978 
979   PetscFunctionBegin;
980   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
981 
982 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
983   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
984 #else
985   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
986   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
987 #endif
988   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
989 
990   A->offloadmask = PETSC_OFFLOAD_BOTH;
991 
992   /* permutation indices: uploaded once, and only for a non-identity ordering */
993   PetscCall(ISIdentity(ip, &perm_identity));
994   if (!perm_identity && !cusparseTriFactors->rpermIndices) {
995     IS              iip;
996     const PetscInt *irip, *rip;
997 
998     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
999     PetscCall(ISGetIndices(iip, &irip));
1000     PetscCall(ISGetIndices(ip, &rip));
1001     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1002     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1003     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1004     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1005     PetscCall(ISRestoreIndices(iip, &irip));
1006     PetscCall(ISDestroy(&iip));
1007     PetscCall(ISRestoreIndices(ip, &rip));
1008     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1009   }
1010   PetscFunctionReturn(PETSC_SUCCESS);
1011 }
1012 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
+ B    - the factor matrix to fill
. A    - the matrix to factor
- info - factorization options, passed through to MatCholeskyFactorNumeric_SeqAIJ()

  Notes:
  A is first brought back from the GPU and factored on the host, so B is marked PETSC_OFFLOAD_CPU. The
  solve callbacks of B are then selected (by CUDA version and, for older toolkits, by whether the
  ordering is the identity), matsolve/matsolvetranspose are cleared, and the factors are analysed and
  copied to the GPU.
*/
1013 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1014 {
1015   PetscFunctionBegin;
1016   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1017   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1018   B->offloadmask = PETSC_OFFLOAD_CPU;
1019 
1020 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1021   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1022   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1023 #else
1024   /* pick the solve routines according to whether the ordering is the identity */
1025   Mat_SeqAIJ *fact = static_cast<Mat_SeqAIJ *>(B->data);
1026   IS          perm = fact->row;
1027   PetscBool   natural;
1028 
1029   PetscCall(ISIdentity(perm, &natural));
1030   if (!natural) {
1031     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1032     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1033   } else {
1034     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1035     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1036   }
1037 #endif
1038   B->ops->matsolvetranspose = NULL;
1039   B->ops->matsolve          = NULL;
1040 
1041   /* analyse the freshly computed factors and copy them to the GPU */
1042   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1043   PetscFunctionReturn(PETSC_SUCCESS);
1044 }
1045 
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054   cusparseIndexBase_t                indexBase;
1055   cusparseMatrixType_t               matrixType;
1056   cusparseFillMode_t                 fillMode;
1057   cusparseDiagType_t                 diagType;
1058 
1059   PetscFunctionBegin;
1060   /* allocate space for the transpose of the lower triangular factor */
1061   PetscCall(PetscNew(&loTriFactorT));
1062   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063 
1064   /* set the matrix descriptors of the lower triangular factor */
1065   matrixType = cusparseGetMatType(loTriFactor->descr);
1066   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069 
1070   /* Create the matrix description */
1071   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076 
1077   /* set the operation */
1078   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079 
1080   /* allocate GPU space for the CSC of the lower triangular factor*/
1081   loTriFactorT->csrMat                 = new CsrMatrix;
1082   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088 
1089   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095   #endif
1096 
1097   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098   {
1099     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104   #else
1105                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106   #endif
1107     PetscCallCUSPARSE(stat);
1108   }
1109 
1110   PetscCallCUDA(WaitForCUDA());
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112 
1113   /* Create the solve analysis information */
1114   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120   #endif
1121 
1122   /* perform the solve analysis */
1123   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125 
1126   PetscCallCUDA(WaitForCUDA());
1127   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128 
1129   /* assign the pointer */
1130   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131 
1132   /*********************************************/
1133   /* Now the Transpose of the Upper Tri Factor */
1134   /*********************************************/
1135 
1136   /* allocate space for the transpose of the upper triangular factor */
1137   PetscCall(PetscNew(&upTriFactorT));
1138   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139 
1140   /* set the matrix descriptors of the upper triangular factor */
1141   matrixType = cusparseGetMatType(upTriFactor->descr);
1142   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145 
1146   /* Create the matrix description */
1147   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152 
1153   /* set the operation */
1154   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155 
1156   /* allocate GPU space for the CSC of the upper triangular factor*/
1157   upTriFactorT->csrMat                 = new CsrMatrix;
1158   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164 
1165   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171   #endif
1172 
1173   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   {
1175     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180   #else
1181                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182   #endif
1183     PetscCallCUSPARSE(stat);
1184   }
1185 
1186   PetscCallCUDA(WaitForCUDA());
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188 
1189   /* Create the solve analysis information */
1190   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196   #endif
1197 
1198   /* perform the solve analysis */
1199   /* christ, would it have killed you to put this stuff in a function????????? */
1200   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202 
1203   PetscCallCUDA(WaitForCUDA());
1204   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205 
1206   /* assign the pointer */
1207   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211 
/* Unary functor usable on host and device: takes the real part of a PetscScalar and casts it to PetscInt */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar value)
  {
    const PetscInt asIndex = (PetscInt)PetscRealPart(value);
    return asIndex;
  }
};
1215 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Builds, or refreshes the values of, the explicit transpose
  of A that is kept in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; MatSeqAIJCUSPARSECopyToGPU() is called on it first

  Notes:
  Returns right after the sanity checks when A->transupdated is already set. On success
  A->transupdated is set to PETSC_TRUE.

  MAT_CUSPARSE_CSR: the transpose structure is created once. The first time through, csr2csc is run
  on the sequence 0,1,2,... in place of the matrix values, and the transposed "values" are stored as
  integers in cusparsestruct->csr2csc_i. Every call then fills the values of the transpose by
  gathering the values of A through csr2csc_i.

  MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB: MatSeqAIJCUSPARSEInvalidateTranspose() is called first
  (presumably dropping any previous transpose - confirm in that routine), and, when no transpose
  structure is present, one is built by going HYB -> CSR -> CSC -> HYB. With CUDA >= 11 these
  formats raise PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  /* an up-to-date flag without a transpose structure is an inconsistent state */
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose has A's column count as its row count, and vice versa */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* device copy of the host row offsets a->i, used below as the csr2csc input */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The CSC of A is the CSR of its transpose, which has A->cmap->n rows and A->rmap->n columns:
         csr2csc writes A->cmap->n + 1 column pointers into tempT->row_offsets, so that array must be
         sized by the column count of A (sizing it by the row count overflows when A has more
         columns than rows). */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transposed matrix held in tempT */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR -> CSC permutation once: transpose the sequence 0,1,2,... instead of the
         actual values, so the transposed "values" are the source positions in matrix->values */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the transposed sequence, currently stored as scalars in matrixT->values, to integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the current values of A into the transpose through the stored permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408 
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - Solves with the L and U factors held in A->spptr using cusparseSpSV_solve().

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  When fs->rpermIndices is set, b is first gathered through it into the work array fs->X.
  When fs->cpermIndices is set, the result of the U solve is written to fs->X and then gathered
  through fs->cpermIndices into x; otherwise the U solve writes into x directly.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs     = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij    = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                nrows  = A->rmap->n;
  const cusparseOperation_t     noTran = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  // Input of the L solve: either b itself, or b gathered through the row permutation into fs->X
  void *lowerInput = (void *)rhs;
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    lowerInput = fs->X;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, lowerInput));

  // L Y = X, with Y living in the work array fs->Y
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // cusparseSpSV_solve() silently reuses the external buffer that was handed to cusparseSpSV_analysis()
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, noTran, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_L));

  // U X = Y; X aliases x unless a column permutation still has to be applied afterwards
  void *upperOutput = fs->cpermIndices ? (void *)fs->X : (void *)sol;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, upperOutput));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, noTran, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_U));

  // Gather fs->X through the column permutation into x when one is present
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), solGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Transpose solve with the L and U factors held in A->spptr,
  applying CUSPARSE_OPERATION_TRANSPOSE to the stored factors.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The SpSV descriptors and buffers for the transposed solves are created on the first call
  (fs->createdTransposeSpSVDescr), and the SpSV analysis is rerun whenever
  fs->updatedTransposeSpSVAnalysis is not set.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                nrows   = A->rmap->n;
  const cusparseOperation_t     tranOp  = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  // One-time setup on the first MatSolveTranspose(): descriptors and work buffers
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    // The matrix descriptor is still that of L; only the operation is the transpose
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, tranOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, tranOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // Redo the analysis of both transposed solves if it is flagged as stale
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, tranOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, tranOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  // Input of the first solve: either b itself, or b gathered through the row permutation into fs->X
  void *firstInput = (void *)rhs;
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    firstInput = fs->X;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, firstInput));

  // Ut Y = X, with Y living in the work array fs->Y
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, tranOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut));

  // Lt X = Y; the intermediate buffer fs->X is the target when a permutation still has to be applied
  void *secondOutput = fs->cpermIndices ? (void *)fs->X : (void *)sol;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, secondOutput));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, tranOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt));

  // Gather fs->X through the column permutation into x when one is present
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), solGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1532 #else
1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve using the explicitly transposed triangular
  factors (loTriFactorPtrTranspose / upTriFactorPtrTranspose) together with the row and column
  permutations stored in A->spptr.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  When neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called
  to create them. cusparseTriFactors->workVector is used as scratch space.

  All Thrust calls are wrapped in PetscCallThrust(), as elsewhere in this file, so that a Thrust
  failure is reported as a PETSc error.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    /* re-read the pointers, which the analysis routine has just assigned */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: xx receives bb gathered through rpermIndices */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U: reads xx, writes the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: reads the work vector, writes xx */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Runs the two csrsv triangular solves with the transposed
  factors of A, reading bb and writing xx directly (no permutation of either vector is performed here)

  Input Parameters:
+ A  - factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector

  Output Parameter:
. xx - solution vector

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called first.

  The result of the first solve is stored in the factors' work vector, which is the input of the second solve.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;

  PetscFunctionBegin;
  /* Neither transposed factor is available yet: build them now and pick up the fresh pointers */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the output and input vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: solve with the transposed U factor, b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, upperT->csrMat->num_rows, upperT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr,
                                         upperT->csrMat->values->data().get(), upperT->csrMat->row_offsets->data().get(), upperT->csrMat->column_indices->data().get(),
                                         upperT->solveInfo, b_d, work->data().get(), upperT->solvePolicy, upperT->solveBuffer));

  /* Step 2: solve with the transposed L factor, work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, lowerT->csrMat->num_rows, lowerT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr,
                                         lowerT->csrMat->values->data().get(), lowerT->csrMat->row_offsets->data().get(), lowerT->csrMat->column_indices->data().get(),
                                         lowerT->solveInfo, work->data().get(), x_d, lowerT->solvePolicy, lowerT->solveBuffer));

  /* Hand the device arrays back before closing the timer */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1623 
/*
  MatSolve_SeqAIJCUSPARSE - Triangular solves with the L and U factors of A, with the right-hand side gathered
  through rpermIndices beforehand and the result gathered through cpermIndices afterwards

  Input Parameters:
+ A  - factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector

  Output Parameter:
. xx - solution vector

  Notes:
  Data flow: bb[rperm] -> work, L solve: work -> xx, U solve: xx -> work, work[cperm] -> xx.

  The thrust copies are issued on PetscDefaultCudaStream.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors         *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                          *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> b_thrust;
  thrust::device_ptr<PetscScalar>       x_thrust;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors, plus thrust views of them */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));
  x_thrust = thrust::device_pointer_cast(x_d);
  b_thrust = thrust::device_pointer_cast(b_d);

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: gather b through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(b_thrust, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(b_thrust, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: solve with L, work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, lower->csrMat->num_rows, lower->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                         lower->csrMat->values->data().get(), lower->csrMat->row_offsets->data().get(), lower->csrMat->column_indices->data().get(),
                                         lower->solveInfo, work->data().get(), x_d, lower->solvePolicy, lower->solveBuffer));

  /* Step 3: solve with U, x -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, upper->csrMat->num_rows, upper->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                         upper->csrMat->values->data().get(), upper->csrMat->row_offsets->data().get(), upper->csrMat->column_indices->data().get(),
                                         upper->solveInfo, x_d, work->data().get(), upper->solvePolicy, upper->solveBuffer));

  /* Step 4: gather the work vector through the column permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               x_thrust);

  /* Hand the device arrays back before closing the timer */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1663 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Triangular solves with the L and U factors of A, reading bb and
  writing xx directly (no permutation of either vector is performed here)

  Input Parameters:
+ A  - factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector

  Output Parameter:
. xx - solution vector

  Notes:
  Data flow: L solve: bb -> work, U solve: work -> xx, where work is the factors' work vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: solve with L, b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, lower->csrMat->num_rows, lower->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                         lower->csrMat->values->data().get(), lower->csrMat->row_offsets->data().get(), lower->csrMat->column_indices->data().get(),
                                         lower->solveInfo, b_d, work->data().get(), lower->solvePolicy, lower->solveBuffer));

  /* Step 2: solve with U, work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, upper->csrMat->num_rows, upper->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                         upper->csrMat->values->data().get(), upper->csrMat->row_offsets->data().get(), upper->csrMat->column_indices->data().get(),
                                         upper->solveInfo, work->data().get(), x_d, upper->solvePolicy, upper->solveBuffer));

  /* Hand the device arrays back before closing the timer */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1693 #endif
1694 
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric phase of the cuSPARSE ILU(0) factorization

  Input Parameters:
+ fact - the factor matrix; fact->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix to factor; in debug builds it is checked to be of type MATSEQAIJCUSPARSE
- (unnamed) - factorization info, not used

  Notes:
  A's device values are copied into the factor's value array, which is then factored in place with
  cusparseXcsrilu02() (skipped when there are no rows). The SpSV analyses of L and U are redone afterwards
  because they need the factored values, and the transpose-solve analysis is flagged as out of date.

  On return the solve and solvetranspose operations of fact are set to the _LU variants and the
  matsolve and matsolvetranspose operations are cleared.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Agpu    = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      nrows, nnz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Bring A up to date on the device, then copy its values into the factor's value array */
  nrows = fact->rmap->n;
  nnz   = factAij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Agpu->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(factors->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* In-place factorization; cusparseXcsrilu02 errors out with empty matrices (m=0), so skip it then */
  if (nrows) {
    PetscCallCUSPARSE(cusparseXcsrilu02(factors->handle, nrows, nnz, factors->matDescr_M, factors->csrVal, factors->csrRowPtr32, factors->csrColIdx32, factors->ilu0Info_M, factors->policy_M, factors->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int              zeroPivotRow;
    cusparseStatus_t pivotStatus;

    pivotStatus = cusparseXcsrilu02_zeroPivot(factors->handle, factors->ilu0Info_M, &zeroPivotRow);
    PetscAssert(pivotStatus != CUSPARSE_STATUS_ZERO_PIVOT, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", zeroPivotRow, zeroPivotRow);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it needs valid matrix values, so it has to come after cusparseXcsrilu02().
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L, factors->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U, factors->spsvBuffer_U));

  /* The values of L and U are new, so the SpSV analysis for the transpose solve must be redone when needed */
  factors->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  /* spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. */
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(factors->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1747 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - Symbolic phase of the cuSPARSE ILU(0) factorization

  Input Parameters:
+ fact - the factor matrix; fact->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix to factor
. (unnamed IS, IS) - row and column orderings, not used
- info - factorization info; only info->fill is read (recorded as fill_ratio_given)

  Notes:
  In debug builds, A is checked to be a square MATSEQAIJCUSPARSE matrix with no missing diagonal entry.

  The routine resets any previous factor data, copies A's metadata into fact, copies A's 32-bit row offsets and
  column indices into freshly allocated device arrays, creates the csrilu02 and SpSV descriptors for M, L and U,
  allocates the work buffers, runs the csrilu02 analysis, stores an estimate of the numeric factorization FLOPs
  and installs MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0() as the numeric factorization routine.

  L and U share the row offsets, column indices and values of the factor; they differ only in the fill mode and
  diagonal type attributes set on their descriptors.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L: lower triangle of the shared CSR arrays, with a unit diagonal */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* U: upper triangle of the same CSR arrays, with the stored (non-unit) diagonal */
  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  /* Device vectors behind the dense-vector descriptors used by the SpSV calls */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *rowPtr, *diagPtr, nzRow, nzLeft; /* named differently from the device Ai/Aj above to avoid shadowing them */
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    rowPtr  = Aseq->i;
    diagPtr = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (rowPtr[i] < diagPtr[i] && diagPtr[i] < rowPtr[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow = rowPtr[i + 1] - rowPtr[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        /* NOTE(review): the number of nonzeros left of the diagonal is approximated as half of the off-diagonal
           entries of the row; the exact count, diagPtr[i] - rowPtr[i], is available here but has never been
           used by this estimate -- confirm the approximation is intended before changing the logged value.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1904 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - Solve with the cuSPARSE ICC(0) factor using two SpSV solves with L

  Input Parameters:
+ fact - the factor matrix; fact->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- b    - right-hand side vector

  Output Parameter:
. x - solution vector

  Notes:
  The first solve uses L and writes into the internal vector fs->Y; the second uses the transpose of L
  and writes into x. The dense-vector descriptors are re-pointed at the device arrays of b and x for the
  duration of the solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *x_d;
  const PetscScalar            *b_d;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve, L Y = X, with X bound to b and Y bound to the internal work array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y,
                                       cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Backward solve, Lt X = Y, with X now bound to x */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X,
                                       cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factAij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1935 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric phase of the cuSPARSE ICC(0) factorization

  Input Parameters:
+ fact - the factor matrix; fact->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix to factor; in debug builds it is checked to be of type MATSEQAIJCUSPARSE
- (unnamed) - factorization info, not used

  Notes:
  A's device values are copied into the factor's value array, which is then factored in place with
  cusparseXcsric02() (skipped when there are no rows). The SpSV analyses for L and for the transpose of L
  are run afterwards.

  On return both the solve and solvetranspose operations of fact are set to MatSolve_SeqAIJCUSPARSE_ICC0()
  and the matsolve and matsolvetranspose operations are cleared.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Agpu    = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      nrows, nnz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Bring A up to date on the device, then copy its values into the factor's value array */
  nrows = fact->rmap->n;
  nnz   = factAij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Agpu->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(factors->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* In-place factorization */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (nrows) {
    PetscCallCUSPARSE(cusparseXcsric02(factors->handle, nrows, nnz, factors->matDescr_M, factors->csrVal, factors->csrRowPtr32, factors->csrColIdx32, factors->ic0Info_M, factors->policy_M, factors->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int              zeroPivotRow;
    cusparseStatus_t pivotStatus;

    pivotStatus = cusparseXcsric02_zeroPivot(factors->handle, factors->ic0Info_M, &zeroPivotRow);
    PetscAssert(pivotStatus != CUSPARSE_STATUS_ZERO_PIVOT, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", zeroPivotRow, zeroPivotRow);
  }

  /* SpSV analysis for the solve with L */
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L, factors->spsvBuffer_L));

  /* SpSV analysis for the solve with the transpose of L.
     Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, factors->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(factors->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1988 
1989 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1990 {
1991   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1992   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1993   PetscInt                      m, nz;
1994 
1995   PetscFunctionBegin;
1996   if (PetscDefined(USE_DEBUG)) {
1997     PetscInt  i;
1998     PetscBool flg, missing;
1999 
2000     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2001     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2002     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2003     PetscCall(MatMissingDiagonal(A, &missing, &i));
2004     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2005   }
2006 
2007   /* Free the old stale stuff */
2008   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2009 
2010   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2011      but they will not be used. Allocate them just for easy debugging.
2012    */
2013   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2014 
2015   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2016   fact->factortype             = MAT_FACTOR_ICC;
2017   fact->info.factor_mallocs    = 0;
2018   fact->info.fill_ratio_given  = info->fill;
2019   fact->info.fill_ratio_needed = 1.0;
2020 
2021   aij->row = NULL;
2022   aij->col = NULL;
2023 
2024   /* ====================================================================== */
2025   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2026   /* We'll do in-place factorization on fact                                */
2027   /* ====================================================================== */
2028   const int *Ai, *Aj;
2029 
2030   m  = fact->rmap->n;
2031   nz = aij->nz;
2032 
2033   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
2034   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
2035   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2036   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2037   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2038   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2039 
2040   /* ====================================================================== */
2041   /* Create mat descriptors for M, L                                        */
2042   /* ====================================================================== */
2043   cusparseFillMode_t fillMode;
2044   cusparseDiagType_t diagType;
2045 
2046   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2047   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2048   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2049 
2050   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2051     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2052     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2053     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2054     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2055   */
2056   fillMode = CUSPARSE_FILL_MODE_LOWER;
2057   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2058   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2059   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2060   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2061 
2062   /* ========================================================================= */
2063   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2064   /* ========================================================================= */
2065   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2066   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2067 
2068   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2069   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2070 
2071   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2072   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2073 
2074   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2075   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2079 
2080   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2081      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2082    */
2083   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2084     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2085     fs->spsvBuffer_L = fs->factBuffer_M;
2086     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2087   } else {
2088     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2089     fs->spsvBuffer_Lt = fs->factBuffer_M;
2090     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2091   }
2092 
2093   /* ========================================================================== */
2094   /* Perform analysis of ic0 on M                                               */
2095   /* The lower triangular part of M has the same sparsity pattern as L          */
2096   /* ========================================================================== */
2097   int              structural_zero;
2098   cusparseStatus_t status;
2099 
2100   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2101   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2102   if (PetscDefined(USE_DEBUG)) {
2103     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2104     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2105     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2106   }
2107 
2108   /* Estimate FLOPs of the numeric factorization */
2109   {
2110     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2111     PetscInt      *Ai, nzRow, nzLeft;
2112     PetscLogDouble flops = 0.0;
2113 
2114     Ai = Aseq->i;
2115     for (PetscInt i = 0; i < m; i++) {
2116       nzRow = Ai[i + 1] - Ai[i];
2117       if (nzRow > 1) {
2118         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2119           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2120         */
2121         nzLeft = (nzRow - 1) / 2;
2122         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2123       }
2124     }
2125     fs->numericFactFlops = flops;
2126   }
2127   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 #endif
2131 
2132 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2133 {
2134   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2135   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2136 
2137   PetscFunctionBegin;
2138   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2139   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2140   B->offloadmask = PETSC_OFFLOAD_CPU;
2141 
2142   if (!cusparsestruct->use_cpu_solve) {
2143 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2144     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2145     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2146 #else
2147     /* determine which version of MatSolve needs to be used. */
2148     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2149     IS          isrow = b->row, iscol = b->col;
2150     PetscBool   row_identity, col_identity;
2151 
2152     PetscCall(ISIdentity(isrow, &row_identity));
2153     PetscCall(ISIdentity(iscol, &col_identity));
2154     if (row_identity && col_identity) {
2155       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2156       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2157     } else {
2158       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2159       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2160     }
2161 #endif
2162   }
2163   B->ops->matsolve          = NULL;
2164   B->ops->matsolvetranspose = NULL;
2165 
2166   /* get the triangular factors */
2167   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2168   PetscFunctionReturn(PETSC_SUCCESS);
2169 }
2170 
2171 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2172 {
2173   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2174 
2175   PetscFunctionBegin;
2176   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2177   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2178   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2179   PetscFunctionReturn(PETSC_SUCCESS);
2180 }
2181 
2182 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2183 {
2184   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2185 
2186   PetscFunctionBegin;
2187 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2188   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2189   if (cusparseTriFactors->factorizeOnDevice) {
2190     PetscCall(ISIdentity(isrow, &row_identity));
2191     PetscCall(ISIdentity(iscol, &col_identity));
2192   }
2193   if (!info->levels && row_identity && col_identity) {
2194     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2195   } else
2196 #endif
2197   {
2198     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2199     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2200     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2201   }
2202   PetscFunctionReturn(PETSC_SUCCESS);
2203 }
2204 
2205 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2206 {
2207   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2208 
2209   PetscFunctionBegin;
2210 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2211   PetscBool perm_identity = PETSC_FALSE;
2212   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
2213   if (!info->levels && perm_identity) {
2214     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2215   } else
2216 #endif
2217   {
2218     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2219     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2220     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2221   }
2222   PetscFunctionReturn(PETSC_SUCCESS);
2223 }
2224 
2225 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2226 {
2227   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2228 
2229   PetscFunctionBegin;
2230   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2231   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2232   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2233   PetscFunctionReturn(PETSC_SUCCESS);
2234 }
2235 
2236 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2237 {
2238   PetscFunctionBegin;
2239   *type = MATSOLVERCUSPARSE;
2240   PetscFunctionReturn(PETSC_SUCCESS);
2241 }
2242 
2243 /*MC
2244   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
2245   of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
2246   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2247   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2248   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2249   algorithms are not recommended. This class does NOT support direct solver operations.
2250 
2251   Level: beginner
2252 
2253 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2254           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2255 M*/
2256 
2257 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2258 {
2259   PetscInt  n = A->rmap->n;
2260   PetscBool factOnDevice, factOnHost;
2261   char     *prefix;
2262   char      factPlace[32] = "device"; /* the default */
2263 
2264   PetscFunctionBegin;
2265   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2266   PetscCall(MatSetSizes(*B, n, n, n, n));
2267   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2268   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2269 
2270   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
2271   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
2272   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
2273   PetscOptionsEnd();
2274   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
2275   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
2276   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
2277   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
2278 
2279   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2280   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2281     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2282     if (!A->boundtocpu) {
2283       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2284       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2285     } else {
2286       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2287       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2288     }
2289     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2290     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2291     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2292   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2293     if (!A->boundtocpu) {
2294       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2295       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2296     } else {
2297       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2298       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2299     }
2300     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2301     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2302   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2303 
2304   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2305   (*B)->canuseordering = PETSC_TRUE;
2306   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2307   PetscFunctionReturn(PETSC_SUCCESS);
2308 }
2309 
2310 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2311 {
2312   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2313   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2314 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2315   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2316 #endif
2317 
2318   PetscFunctionBegin;
2319   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2320     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2321     if (A->factortype == MAT_FACTOR_NONE) {
2322       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2323       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2324     }
2325 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2326     else if (fs->csrVal) {
2327       /* We have a factorized matrix on device and are able to copy it to host */
2328       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2329     }
2330 #endif
2331     else
2332       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2333     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2334     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2335     A->offloadmask = PETSC_OFFLOAD_BOTH;
2336   }
2337   PetscFunctionReturn(PETSC_SUCCESS);
2338 }
2339 
2340 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2341 {
2342   PetscFunctionBegin;
2343   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2344   *array = ((Mat_SeqAIJ *)A->data)->a;
2345   PetscFunctionReturn(PETSC_SUCCESS);
2346 }
2347 
2348 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2349 {
2350   PetscFunctionBegin;
2351   A->offloadmask = PETSC_OFFLOAD_CPU;
2352   *array         = NULL;
2353   PetscFunctionReturn(PETSC_SUCCESS);
2354 }
2355 
2356 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2357 {
2358   PetscFunctionBegin;
2359   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2360   *array = ((Mat_SeqAIJ *)A->data)->a;
2361   PetscFunctionReturn(PETSC_SUCCESS);
2362 }
2363 
2364 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2365 {
2366   PetscFunctionBegin;
2367   *array = NULL;
2368   PetscFunctionReturn(PETSC_SUCCESS);
2369 }
2370 
2371 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2372 {
2373   PetscFunctionBegin;
2374   *array = ((Mat_SeqAIJ *)A->data)->a;
2375   PetscFunctionReturn(PETSC_SUCCESS);
2376 }
2377 
2378 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2379 {
2380   PetscFunctionBegin;
2381   A->offloadmask = PETSC_OFFLOAD_CPU;
2382   *array         = NULL;
2383   PetscFunctionReturn(PETSC_SUCCESS);
2384 }
2385 
2386 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2387 {
2388   Mat_SeqAIJCUSPARSE *cusp;
2389   CsrMatrix          *matrix;
2390 
2391   PetscFunctionBegin;
2392   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2393   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2394   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2395   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2396   matrix = (CsrMatrix *)cusp->mat->mat;
2397 
2398   if (i) {
2399 #if !defined(PETSC_USE_64BIT_INDICES)
2400     *i = matrix->row_offsets->data().get();
2401 #else
2402     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2403 #endif
2404   }
2405   if (j) {
2406 #if !defined(PETSC_USE_64BIT_INDICES)
2407     *j = matrix->column_indices->data().get();
2408 #else
2409     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2410 #endif
2411   }
2412   if (a) *a = matrix->values->data().get();
2413   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2414   PetscFunctionReturn(PETSC_SUCCESS);
2415 }
2416 
2417 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2418 {
2419   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2420   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2421   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2422   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2423   cusparseStatus_t              stat;
2424   PetscBool                     both = PETSC_TRUE;
2425 
2426   PetscFunctionBegin;
2427   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2428   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2429     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2430       CsrMatrix *matrix;
2431       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2432 
2433       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2434       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2435       matrix->values->assign(a->a, a->a + a->nz);
2436       PetscCallCUDA(WaitForCUDA());
2437       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2438       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2439       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2440     } else {
2441       PetscInt nnz;
2442       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2443       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2444       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2445       delete cusparsestruct->workVector;
2446       delete cusparsestruct->rowoffsets_gpu;
2447       cusparsestruct->workVector     = NULL;
2448       cusparsestruct->rowoffsets_gpu = NULL;
2449       try {
2450         if (a->compressedrow.use) {
2451           m    = a->compressedrow.nrows;
2452           ii   = a->compressedrow.i;
2453           ridx = a->compressedrow.rindex;
2454         } else {
2455           m    = A->rmap->n;
2456           ii   = a->i;
2457           ridx = NULL;
2458         }
2459         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2460         if (!a->a) {
2461           nnz  = ii[m];
2462           both = PETSC_FALSE;
2463         } else nnz = a->nz;
2464         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2465 
2466         /* create cusparse matrix */
2467         cusparsestruct->nrows = m;
2468         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2469         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2470         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2471         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2472 
2473         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2474         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2475         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2476         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2477         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2478         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2480 
2481         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2482         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2483           /* set the matrix */
2484           CsrMatrix *mat   = new CsrMatrix;
2485           mat->num_rows    = m;
2486           mat->num_cols    = A->cmap->n;
2487           mat->num_entries = nnz;
2488           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2489           mat->row_offsets->assign(ii, ii + m + 1);
2490 
2491           mat->column_indices = new THRUSTINTARRAY32(nnz);
2492           mat->column_indices->assign(a->j, a->j + nnz);
2493 
2494           mat->values = new THRUSTARRAY(nnz);
2495           if (a->a) mat->values->assign(a->a, a->a + nnz);
2496 
2497           /* assign the pointer */
2498           matstruct->mat = mat;
2499 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2500           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2501             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2502                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2503             PetscCallCUSPARSE(stat);
2504           }
2505 #endif
2506         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2507 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2508           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2509 #else
2510           CsrMatrix *mat   = new CsrMatrix;
2511           mat->num_rows    = m;
2512           mat->num_cols    = A->cmap->n;
2513           mat->num_entries = nnz;
2514           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2515           mat->row_offsets->assign(ii, ii + m + 1);
2516 
2517           mat->column_indices = new THRUSTINTARRAY32(nnz);
2518           mat->column_indices->assign(a->j, a->j + nnz);
2519 
2520           mat->values = new THRUSTARRAY(nnz);
2521           if (a->a) mat->values->assign(a->a, a->a + nnz);
2522 
2523           cusparseHybMat_t hybMat;
2524           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2525           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2526           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2527           PetscCallCUSPARSE(stat);
2528           /* assign the pointer */
2529           matstruct->mat = hybMat;
2530 
2531           if (mat) {
2532             if (mat->values) delete (THRUSTARRAY *)mat->values;
2533             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2534             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2535             delete (CsrMatrix *)mat;
2536           }
2537 #endif
2538         }
2539 
2540         /* assign the compressed row indices */
2541         if (a->compressedrow.use) {
2542           cusparsestruct->workVector = new THRUSTARRAY(m);
2543           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2544           matstruct->cprowIndices->assign(ridx, ridx + m);
2545           tmp = m;
2546         } else {
2547           cusparsestruct->workVector = NULL;
2548           matstruct->cprowIndices    = NULL;
2549           tmp                        = 0;
2550         }
2551         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2552 
2553         /* assign the pointer */
2554         cusparsestruct->mat = matstruct;
2555       } catch (char *ex) {
2556         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2557       }
2558       PetscCallCUDA(WaitForCUDA());
2559       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2560       cusparsestruct->nonzerostate = A->nonzerostate;
2561     }
2562     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2563   }
2564   PetscFunctionReturn(PETSC_SUCCESS);
2565 }
2566 
2567 struct VecCUDAPlusEquals {
2568   template <typename Tuple>
2569   __host__ __device__ void operator()(Tuple t)
2570   {
2571     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2572   }
2573 };
2574 
2575 struct VecCUDAEquals {
2576   template <typename Tuple>
2577   __host__ __device__ void operator()(Tuple t)
2578   {
2579     thrust::get<1>(t) = thrust::get<0>(t);
2580   }
2581 };
2582 
2583 struct VecCUDAEqualsReverse {
2584   template <typename Tuple>
2585   __host__ __device__ void operator()(Tuple t)
2586   {
2587     thrust::get<0>(t) = thrust::get<1>(t);
2588   }
2589 };
2590 
2591 struct MatMatCusparse {
2592   PetscBool      cisdense;
2593   PetscScalar   *Bt;
2594   Mat            X;
2595   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2596   PetscLogDouble flops;
2597   CsrMatrix     *Bcsr;
2598 
2599 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2600   cusparseSpMatDescr_t matSpBDescr;
2601   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2602   cusparseDnMatDescr_t matBDescr;
2603   cusparseDnMatDescr_t matCDescr;
2604   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2605   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2606   void *dBuffer4;
2607   void *dBuffer5;
2608   #endif
2609   size_t                mmBufferSize;
2610   void                 *mmBuffer;
2611   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2612   cusparseSpGEMMDescr_t spgemmDesc;
2613 #endif
2614 };
2615 
2616 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2617 {
2618   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2619 
2620   PetscFunctionBegin;
2621   PetscCallCUDA(cudaFree(mmdata->Bt));
2622   delete mmdata->Bcsr;
2623 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2624   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2625   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2626   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2627   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2628   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2629   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2630   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2631   #endif
2632   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2633   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2634 #endif
2635   PetscCall(MatDestroy(&mmdata->X));
2636   PetscCall(PetscFree(data));
2637   PetscFunctionReturn(PETSC_SUCCESS);
2638 }
2639 
2640 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2641 
2642 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643 {
2644   Mat_Product                  *product = C->product;
2645   Mat                           A, B;
2646   PetscInt                      m, n, blda, clda;
2647   PetscBool                     flg, biscuda;
2648   Mat_SeqAIJCUSPARSE           *cusp;
2649   cusparseStatus_t              stat;
2650   cusparseOperation_t           opA;
2651   const PetscScalar            *barray;
2652   PetscScalar                  *carray;
2653   MatMatCusparse               *mmdata;
2654   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655   CsrMatrix                    *csrmat;
2656 
2657   PetscFunctionBegin;
2658   MatCheckProduct(C, 1);
2659   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660   mmdata = (MatMatCusparse *)product->data;
2661   A      = product->A;
2662   B      = product->B;
2663   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666      Instead of silently accepting the wrong answer, I prefer to raise the error */
2667   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670   switch (product->type) {
2671   case MATPRODUCT_AB:
2672   case MATPRODUCT_PtAP:
2673     mat = cusp->mat;
2674     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675     m   = A->rmap->n;
2676     n   = B->cmap->n;
2677     break;
2678   case MATPRODUCT_AtB:
2679     if (!A->form_explicit_transpose) {
2680       mat = cusp->mat;
2681       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682     } else {
2683       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684       mat = cusp->matTranspose;
2685       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686     }
2687     m = A->cmap->n;
2688     n = B->cmap->n;
2689     break;
2690   case MATPRODUCT_ABt:
2691   case MATPRODUCT_RARt:
2692     mat = cusp->mat;
2693     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694     m   = A->rmap->n;
2695     n   = B->rmap->n;
2696     break;
2697   default:
2698     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699   }
2700   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701   csrmat = (CsrMatrix *)mat->mat;
2702   /* if the user passed a CPU matrix, copy the data to the GPU */
2703   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2706 
2707   PetscCall(MatDenseGetLDA(B, &blda));
2708   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711   } else {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(C, &clda));
2714   }
2715 
2716   PetscCall(PetscLogGpuTimeBegin());
2717 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2720   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2721     size_t mmBufferSize;
2722     if (mmdata->initialized && mmdata->Blda != blda) {
2723       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2724       mmdata->matBDescr = NULL;
2725     }
2726     if (!mmdata->matBDescr) {
2727       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2728       mmdata->Blda = blda;
2729     }
2730 
2731     if (mmdata->initialized && mmdata->Clda != clda) {
2732       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2733       mmdata->matCDescr = NULL;
2734     }
2735     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2736       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737       mmdata->Clda = clda;
2738     }
2739 
2740     if (!mat->matDescr) {
2741       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2742                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2743       PetscCallCUSPARSE(stat);
2744     }
2745     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2746     PetscCallCUSPARSE(stat);
2747     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2748       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2749       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2750       mmdata->mmBufferSize = mmBufferSize;
2751     }
2752     mmdata->initialized = PETSC_TRUE;
2753   } else {
2754     /* to be safe, always update pointers of the mats */
2755     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2756     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2757     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2758   }
2759 
2760   /* do cusparseSpMM, which supports transpose on B */
2761   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2762   PetscCallCUSPARSE(stat);
2763 #else
2764   PetscInt k;
2765   /* cusparseXcsrmm does not support transpose on B */
2766   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2767     cublasHandle_t cublasv2handle;
2768     cublasStatus_t cerr;
2769 
2770     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2771     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2772     PetscCallCUBLAS(cerr);
2773     blda = B->cmap->n;
2774     k    = B->cmap->n;
2775   } else {
2776     k = B->rmap->n;
2777   }
2778 
2779   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2780   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2781   PetscCallCUSPARSE(stat);
2782 #endif
2783   PetscCall(PetscLogGpuTimeEnd());
2784   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2785   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2786   if (product->type == MATPRODUCT_RARt) {
2787     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2788     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2789   } else if (product->type == MATPRODUCT_PtAP) {
2790     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2791     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2792   } else {
2793     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2794   }
2795   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2796   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2797   PetscFunctionReturn(PETSC_SUCCESS);
2798 }
2799 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for the products
  AB, AtB, ABt, PtAP and RARt where A is a MATSEQAIJCUSPARSE matrix and B is dense.

  Input/Output Parameter:
. C - the product matrix. On entry C->product describes the operation (A, B and the
      product type) and C->product->data must be empty. On successful exit the sizes
      of C are set, C is of type MATSEQDENSECUDA, C->product->data holds a
      MatMatCusparse context released through C->product->destroy, and
      C->ops->productnumeric points to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.

  Notes:
  A must use the MAT_CUSPARSE_CSR storage format; any other product type than the five
  listed above raises PETSC_ERR_GPU.

  If C is of type MATSEQDENSE on entry, this is recorded in the context (cisdense) so
  that the numeric phase can convert the result back to MATSEQDENSE.

  For PtAP and RARt an intermediate dense matrix X with A->rmap->n rows is created
  (sizes only, no preallocation) and stored in the context.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* sizes of the result C (m x n) for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU;
     the type must be queried before it is overwritten by MatSetType() below */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data: attach the context (and its destroy routine) to C as soon as it exists, as done in
     MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE, so that it is not leaked if one of the calls below fails */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2870 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse
  products AB, AtB and ABt where A, B and C are all of type MATSEQAIJCUSPARSE.

  Input/Output Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse context

  Notes:
  No product is computed when the context is flagged with reusesym (the flag is cleared
  and only the C structures are validated) or when C has no nonzeros.

  cuSPARSE is always called with non-transpose operations: for AtB and ABt the
  matTranspose structure of A, respectively B, is used as operand. AtB with a symmetric A
  and ABt with a symmetric B are computed as AB, provided the symbolic phase did the same.

  In every case the routine ends by marking C as assembled.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  Mat_SeqAIJ     *caij    = (Mat_SeqAIJ *)C->data;
  MatMatCusparse *ctx;
  PetscBool       match;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  ctx = (MatMatCusparse *)C->product->data;

  if (ctx->reusesym) {
    /* this happens when api_user is true, meaning that the matrix values have been already computed in the
       MatProductSymbolic phase: consume the flag and only validate the structures of C */
    Mat_SeqAIJCUSPARSE           *cuC;
    Mat_SeqAIJCUSPARSEMultStruct *multC;
    CsrMatrix                    *csrC;

    ctx->reusesym = PETSC_FALSE;
    cuC           = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(cuC->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    multC = cuC->mat;
    PetscCheck(multC, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    csrC = (CsrMatrix *)multC->mat;
    PetscCheck(csrC, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  } else if (caij->nz) {
    /* C has nonzeros: recompute its values on the GPU */
    Mat                           A = product->A;
    Mat                           B = product->B;
    Mat_SeqAIJCUSPARSE           *cuA, *cuB, *cuC;
    Mat_SeqAIJCUSPARSEMultStruct *multA, *multB, *multC;
    CsrMatrix                    *csrA, *csrB, *csrC;
    MatProductType                ptype;
    /* cuSPARSE spgemm doesn't support transpose yet */
    cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;

    /* operand validation */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &match));
    PetscCheck(match, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
    PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &match));
    PetscCheck(match, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
    PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    cuA = (Mat_SeqAIJCUSPARSE *)A->spptr;
    cuB = (Mat_SeqAIJCUSPARSE *)B->spptr;
    cuC = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(cuA->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    PetscCheck(cuB->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    PetscCheck(cuC->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

    /* a transposed symmetric operand is handled as a plain AB product, as long as the symbolic phase did the same */
    ptype = product->type;
    if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
      PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
      ptype = MATPRODUCT_AB;
    }
    if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
      PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
      ptype = MATPRODUCT_AB;
    }

    /* pick the mult structures used as operands: the stored transpose stands in for A^T or B^T */
    if (ptype == MATPRODUCT_AB) {
      multA = cuA->mat;
      multB = cuB->mat;
    } else if (ptype == MATPRODUCT_AtB) {
      multA = cuA->matTranspose;
      multB = cuB->mat;
    } else if (ptype == MATPRODUCT_ABt) {
      multA = cuA->mat;
      multB = cuB->matTranspose;
    } else {
      SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
    }
    multC = cuC->mat;
    PetscCheck(multA, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
    PetscCheck(multB, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
    PetscCheck(multC, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);

    csrA = (CsrMatrix *)multA->mat;
    if (ctx->Bcsr) csrB = ctx->Bcsr; /* B may be in compressed row storage */
    else csrB = (CsrMatrix *)multB->mat;
    csrC = (CsrMatrix *)multC->mat;
    PetscCheck(csrA, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
    PetscCheck(csrB, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
    PetscCheck(csrC, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");

    PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    {
      /* B may be in compressed row storage, in which case the descriptor kept in the product data is used */
      cusparseSpMatDescr_t descrB = ctx->Bcsr ? ctx->matSpBDescr : multB->matDescr;

      PetscCallCUSPARSE(cusparseSetPointerMode(cuC->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      PetscCallCUSPARSE(cusparseSpGEMMreuse_compute(cuC->handle, opA, opB, multC->alpha_one, multA->matDescr, descrB, multC->beta_zero, multC->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, ctx->spgemmDesc));
  #else
      PetscCallCUSPARSE(cusparseSpGEMM_compute(cuC->handle, opA, opB, multC->alpha_one, multA->matDescr, descrB, multC->beta_zero, multC->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, ctx->spgemmDesc, &ctx->mmBufferSize, ctx->mmBuffer));
      PetscCallCUSPARSE(cusparseSpGEMM_copy(cuC->handle, opA, opB, multC->alpha_one, multA->matDescr, descrB, multC->beta_zero, multC->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, ctx->spgemmDesc));
  #endif
    }
#else
    PetscCallCUSPARSE(cusparse_csr_spgemm(cuC->handle, opA, opB, csrA->num_rows, csrB->num_cols, csrA->num_cols, multA->descr, csrA->num_entries, csrA->values->data().get(), csrA->row_offsets->data().get(), csrA->column_indices->data().get(), multB->descr, csrB->num_entries,
                                          csrB->values->data().get(), csrB->row_offsets->data().get(), csrB->column_indices->data().get(), multC->descr, csrC->values->data().get(), csrC->row_offsets->data().get(), csrC->column_indices->data().get()));
#endif
    PetscCall(PetscLogGpuFlops(ctx->flops));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuTimeEnd());
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* shorter version of MatAssemblyEnd_SeqAIJ, executed on every path that reaches this point */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, caij->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", caij->rmax));
  caij->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2991 
2992 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2993 {
2994   Mat_Product                  *product = C->product;
2995   Mat                           A, B;
2996   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2997   Mat_SeqAIJ                   *a, *b, *c;
2998   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2999   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3000   PetscInt                      i, j, m, n, k;
3001   PetscBool                     flg;
3002   cusparseStatus_t              stat;
3003   MatProductType                ptype;
3004   MatMatCusparse               *mmdata;
3005   PetscLogDouble                flops;
3006   PetscBool                     biscompressed, ciscompressed;
3007 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3008   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3009   cusparseSpMatDescr_t BmatSpDescr;
3010 #else
3011   int cnz;
3012 #endif
3013   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3014 
3015   PetscFunctionBegin;
3016   MatCheckProduct(C, 1);
3017   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3018   A = product->A;
3019   B = product->B;
3020   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3021   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3022   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3023   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3024   a = (Mat_SeqAIJ *)A->data;
3025   b = (Mat_SeqAIJ *)B->data;
3026   /* product data */
3027   PetscCall(PetscNew(&mmdata));
3028   C->product->data    = mmdata;
3029   C->product->destroy = MatDestroy_MatMatCusparse;
3030 
3031   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3032   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3033   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3034   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3035   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3037 
3038   ptype = product->type;
3039   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3040     ptype                                          = MATPRODUCT_AB;
3041     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3042   }
3043   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3044     ptype                                          = MATPRODUCT_AB;
3045     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3046   }
3047   biscompressed = PETSC_FALSE;
3048   ciscompressed = PETSC_FALSE;
3049   switch (ptype) {
3050   case MATPRODUCT_AB:
3051     m    = A->rmap->n;
3052     n    = B->cmap->n;
3053     k    = A->cmap->n;
3054     Amat = Acusp->mat;
3055     Bmat = Bcusp->mat;
3056     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3057     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3058     break;
3059   case MATPRODUCT_AtB:
3060     m = A->cmap->n;
3061     n = B->cmap->n;
3062     k = A->rmap->n;
3063     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3064     Amat = Acusp->matTranspose;
3065     Bmat = Bcusp->mat;
3066     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3067     break;
3068   case MATPRODUCT_ABt:
3069     m = A->rmap->n;
3070     n = B->rmap->n;
3071     k = A->cmap->n;
3072     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3073     Amat = Acusp->mat;
3074     Bmat = Bcusp->matTranspose;
3075     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3076     break;
3077   default:
3078     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3079   }
3080 
3081   /* create cusparse matrix */
3082   PetscCall(MatSetSizes(C, m, n, m, n));
3083   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3084   c     = (Mat_SeqAIJ *)C->data;
3085   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3086   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3087   Ccsr  = new CsrMatrix;
3088 
3089   c->compressedrow.use = ciscompressed;
3090   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3091     c->compressedrow.nrows = a->compressedrow.nrows;
3092     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3093     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3094     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3095     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3096     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3097   } else {
3098     c->compressedrow.nrows  = 0;
3099     c->compressedrow.i      = NULL;
3100     c->compressedrow.rindex = NULL;
3101     Ccusp->workVector       = NULL;
3102     Cmat->cprowIndices      = NULL;
3103   }
3104   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3105   Ccusp->mat        = Cmat;
3106   Ccusp->mat->mat   = Ccsr;
3107   Ccsr->num_rows    = Ccusp->nrows;
3108   Ccsr->num_cols    = n;
3109   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3110   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3111   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3112   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3113   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
3114   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
3115   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3116   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3117   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3119   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3120     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3121     c->nz                = 0;
3122     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3123     Ccsr->values         = new THRUSTARRAY(c->nz);
3124     goto finalizesym;
3125   }
3126 
3127   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3128   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3129   Acsr = (CsrMatrix *)Amat->mat;
3130   if (!biscompressed) {
3131     Bcsr = (CsrMatrix *)Bmat->mat;
3132 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3133     BmatSpDescr = Bmat->matDescr;
3134 #endif
3135   } else { /* we need to use row offsets for the full matrix */
3136     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3137     Bcsr                 = new CsrMatrix;
3138     Bcsr->num_rows       = B->rmap->n;
3139     Bcsr->num_cols       = cBcsr->num_cols;
3140     Bcsr->num_entries    = cBcsr->num_entries;
3141     Bcsr->column_indices = cBcsr->column_indices;
3142     Bcsr->values         = cBcsr->values;
3143     if (!Bcusp->rowoffsets_gpu) {
3144       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3145       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3146       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3147     }
3148     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3149     mmdata->Bcsr      = Bcsr;
3150 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3151     if (Bcsr->num_rows && Bcsr->num_cols) {
3152       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3153       PetscCallCUSPARSE(stat);
3154     }
3155     BmatSpDescr = mmdata->matSpBDescr;
3156 #endif
3157   }
3158   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3159   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3160   /* precompute flops count */
3161   if (ptype == MATPRODUCT_AB) {
3162     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3163       const PetscInt st = a->i[i];
3164       const PetscInt en = a->i[i + 1];
3165       for (j = st; j < en; j++) {
3166         const PetscInt brow = a->j[j];
3167         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3168       }
3169     }
3170   } else if (ptype == MATPRODUCT_AtB) {
3171     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3172       const PetscInt anzi = a->i[i + 1] - a->i[i];
3173       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3174       flops += (2. * anzi) * bnzi;
3175     }
3176   } else { /* TODO */
3177     flops = 0.;
3178   }
3179 
3180   mmdata->flops = flops;
3181   PetscCall(PetscLogGpuTimeBegin());
3182 
3183 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3184   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3185   // cuda-12.2 requires non-null csrRowOffsets
3186   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3187   PetscCallCUSPARSE(stat);
3188   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3189   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3190   {
3191     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3192      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3193   */
3194     void *dBuffer1 = NULL;
3195     void *dBuffer2 = NULL;
3196     void *dBuffer3 = NULL;
3197     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3198     size_t bufferSize1 = 0;
3199     size_t bufferSize2 = 0;
3200     size_t bufferSize3 = 0;
3201     size_t bufferSize4 = 0;
3202     size_t bufferSize5 = 0;
3203 
3204     /* ask bufferSize1 bytes for external memory */
3205     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3206     PetscCallCUSPARSE(stat);
3207     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3208     /* inspect the matrices A and B to understand the memory requirement for the next step */
3209     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3210     PetscCallCUSPARSE(stat);
3211 
3212     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3213     PetscCallCUSPARSE(stat);
3214     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3215     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3216     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3217     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3218     PetscCallCUSPARSE(stat);
3219     PetscCallCUDA(cudaFree(dBuffer1));
3220     PetscCallCUDA(cudaFree(dBuffer2));
3221 
3222     /* get matrix C non-zero entries C_nnz1 */
3223     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3224     c->nz = (PetscInt)C_nnz1;
3225     /* allocate matrix C */
3226     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3227     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3228     Ccsr->values = new THRUSTARRAY(c->nz);
3229     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3230     /* update matC with the new pointers */
3231     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3232     PetscCallCUSPARSE(stat);
3233 
3234     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3235     PetscCallCUSPARSE(stat);
3236     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3237     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3238     PetscCallCUSPARSE(stat);
3239     PetscCallCUDA(cudaFree(dBuffer3));
3240     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3241     PetscCallCUSPARSE(stat);
3242     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3243   }
3244   #else
3245   size_t bufSize2;
3246   /* ask bufferSize bytes for external memory */
3247   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3248   PetscCallCUSPARSE(stat);
3249   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3250   /* inspect the matrices A and B to understand the memory requirement for the next step */
3251   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3252   PetscCallCUSPARSE(stat);
3253   /* ask bufferSize again bytes for external memory */
3254   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3255   PetscCallCUSPARSE(stat);
3256   /* The CUSPARSE documentation is not clear, nor the API
3257      We need both buffers to perform the operations properly!
3258      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3259      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3260      is stored in the descriptor! What a messy API... */
3261   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3262   /* compute the intermediate product of A * B */
3263   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3264   PetscCallCUSPARSE(stat);
3265   /* get matrix C non-zero entries C_nnz1 */
3266   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3267   c->nz = (PetscInt)C_nnz1;
3268   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3269                       mmdata->mmBufferSize / 1024));
3270   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3271   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3272   Ccsr->values = new THRUSTARRAY(c->nz);
3273   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3274   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3275   PetscCallCUSPARSE(stat);
3276   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3277   PetscCallCUSPARSE(stat);
3278   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3279 #else
3280   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3281   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3282                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3283   PetscCallCUSPARSE(stat);
3284   c->nz                = cnz;
3285   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3286   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3287   Ccsr->values = new THRUSTARRAY(c->nz);
3288   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3289 
3290   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3291   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3292      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3293      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3294   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3295                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3296   PetscCallCUSPARSE(stat);
3297 #endif
3298   PetscCall(PetscLogGpuFlops(mmdata->flops));
3299   PetscCall(PetscLogGpuTimeEnd());
3300 finalizesym:
3301   c->singlemalloc = PETSC_FALSE;
3302   c->free_a       = PETSC_TRUE;
3303   c->free_ij      = PETSC_TRUE;
3304   PetscCall(PetscMalloc1(m + 1, &c->i));
3305   PetscCall(PetscMalloc1(c->nz, &c->j));
3306   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3307     PetscInt      *d_i = c->i;
3308     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3309     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3310     ii = *Ccsr->row_offsets;
3311     jj = *Ccsr->column_indices;
3312     if (ciscompressed) d_i = c->compressedrow.i;
3313     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3314     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3315   } else {
3316     PetscInt *d_i = c->i;
3317     if (ciscompressed) d_i = c->compressedrow.i;
3318     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3319     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3320   }
3321   if (ciscompressed) { /* need to expand host row offsets */
3322     PetscInt r = 0;
3323     c->i[0]    = 0;
3324     for (k = 0; k < c->compressedrow.nrows; k++) {
3325       const PetscInt next = c->compressedrow.rindex[k];
3326       const PetscInt old  = c->compressedrow.i[k];
3327       for (; r < next; r++) c->i[r + 1] = old;
3328     }
3329     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3330   }
3331   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3332   PetscCall(PetscMalloc1(m, &c->ilen));
3333   PetscCall(PetscMalloc1(m, &c->imax));
3334   c->maxnz         = c->nz;
3335   c->nonzerorowcnt = 0;
3336   c->rmax          = 0;
3337   for (k = 0; k < m; k++) {
3338     const PetscInt nn = c->i[k + 1] - c->i[k];
3339     c->ilen[k] = c->imax[k] = nn;
3340     c->nonzerorowcnt += (PetscInt) !!nn;
3341     c->rmax = PetscMax(c->rmax, nn);
3342   }
3343   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3344   PetscCall(PetscMalloc1(c->nz, &c->a));
3345   Ccsr->num_entries = c->nz;
3346 
3347   C->nonzerostate++;
3348   PetscCall(PetscLayoutSetUp(C->rmap));
3349   PetscCall(PetscLayoutSetUp(C->cmap));
3350   Ccusp->nonzerostate = C->nonzerostate;
3351   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3352   C->preallocated     = PETSC_TRUE;
3353   C->assembled        = PETSC_FALSE;
3354   C->was_assembled    = PETSC_FALSE;
3355   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3356     mmdata->reusesym = PETSC_TRUE;
3357     C->offloadmask   = PETSC_OFFLOAD_GPU;
3358   }
3359   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3360   PetscFunctionReturn(PETSC_SUCCESS);
3361 }
3362 
3363 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3364 
/*
  Chooses the symbolic product routine of mat, handling both a sparse (MATSEQAIJCUSPARSE) and a dense B.

  When every operand can run on the GPU the user may still request the CPU code with a
  "*_backend_cpu" option; in that case, or when B (or C for ABC) is not a GPU matrix,
  the SeqAIJ host implementation is selected instead.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* B only counts as a GPU matrix if neither A nor B has been bound to the CPU */
  if (!(product->A->boundtocpu || product->B->boundtocpu)) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    /* apiname: options title and manual page of the user-level routine
       apiopt:  option name used when the product was requested through that routine
       prodname: options title used when the product was requested through the MatProduct API */
    const char *apiname = NULL, *apiopt = NULL, *prodname = NULL;
    PetscBool   usecpu = PETSC_FALSE;

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname  = "MatMatMult";
      apiopt   = "-matmatmult_backend_cpu";
      prodname = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname  = "MatTransposeMatMult";
      apiopt   = "-mattransposematmult_backend_cpu";
      prodname = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname  = "MatPtAP";
      apiopt   = "-matptap_backend_cpu";
      prodname = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname  = "MatRARt";
      apiopt   = "-matrart_backend_cpu";
      prodname = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname  = "MatMatMatMult";
      apiopt   = "-matmatmatmult_backend_cpu";
      prodname = "MatProduct_ABC";
      break;
    default: /* no CPU-backend option is offered for the remaining product types */
      break;
    }
    if (apiname) {
      const char *title  = product->api_user ? apiname : prodname;
      const char *option = product->api_user ? apiopt : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(option, "Use CPU code", apiname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      /* a CPU-bound A uses the host sparse-times-dense code */
      if (!product->A->boundtocpu) mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      else PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      break;
    default:
      break;
    }
  } else if (!(Biscusp && Ciscusp)) { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  } else {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are handled by the basic ABC symbolic routine */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3482 
/* yy = A xx, delegated to the shared mult-add kernel with no vector to add */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3489 
/* zz = A xx + yy, delegated to the shared mult-add kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3496 
/* yy = A^H xx, delegated to the shared mult-add kernel with no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3503 
/* zz = A^H xx + yy, delegated to the shared mult-add kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3510 
/* yy = A^T xx, delegated to the shared mult-add kernel with no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3517 
/*
  ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Expects a 1D launch providing at least n threads in total; threads with a global index >= n do nothing.
  The update is a plain (non-atomic) read-modify-write.
  NOTE(review): this presumably relies on the entries of idx being distinct - confirm at the call site.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Form the global index in PetscInt rather than int: the unsigned 32-bit product blockIdx.x * blockDim.x
     stored in an int can become negative for very large launches, pass the i < n test and index out of bounds */
  const PetscInt i = (PetscInt)blockIdx.x * (PetscInt)blockDim.x + (PetscInt)threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3523 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - computes zz = op(A) xx + yy on the GPU; shared by the MatMult/MatMultAdd family

  Input Parameters:
+ A     - the matrix
. xx    - the vector op(A) is applied to
. yy    - the vector to add, or NULL to compute zz = op(A) xx only; may be the same Vec as zz
. trans - PETSC_TRUE to apply the (conjugate) transpose of A
- herm  - PETSC_TRUE, only valid together with trans, to apply the conjugate transpose

  Output Parameter:
. zz - the result

  Notes:
  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
  herm without trans raises PETSC_ERR_GPU.

  For a plain transpose with A->form_explicit_transpose set, the explicitly stored transpose is multiplied
  with the non-transposed cuSPARSE operation; otherwise cuSPARSE applies the operation to the stored matrix.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx = 0, ny = 0; /* only given meaningful values (and only used) for MAT_CUSPARSE_CSR */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* no nonzeros: op(A) xx vanishes, so zz = yy or zz = 0 */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose to the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
      PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
      PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'matTranspose' (need to fix)");
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length and is written straight into z. z holds y only when yy == zz (it was obtained
         read-write above); otherwise its contents are unspecified (write-only access), so beta must be 0 and
         yy is added with an AXPY after the product.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA is used as an index into matstruct->cuSpMV[], hence the range check */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status, so catch launch failures here */
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3694 
/* zz = A^T xx + yy, delegated to the shared mult-add kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3701 
/*
  Completes assembly with the SeqAIJ routine and, if that changed the nonzero state of A,
  frees cusp->deviceMat and resets the pointer to NULL.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE    *cusp     = (Mat_SeqAIJCUSPARSE *)A->spptr;
  const PetscObjectState oldstate = A->nonzerostate; /* recorded before assembly so a change can be detected */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* nothing to release if there is no device matrix or the nonzero state is unchanged */
  if (!cusp->deviceMat || oldstate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
  PetscCallCUDA(cudaFree(cusp->deviceMat));
  cusp->deviceMat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3716 
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter `nz` (or the array `nnz`).

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
-  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

   Specify the preallocated storage with either `nz` or `nnz` (not both).
   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
   allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* m and n are passed as both the local and the global sizes */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* the preallocation routine takes a non-const array, hence the cast */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, const_cast<PetscInt *>(nnz)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3763 
/*
  Destroys the GPU-side data of A (the triangular-factor data for a factored matrix, the cuSPARSE matrix
  data otherwise), removes the functions composed with A, and then destroys the SeqAIJ part.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in removal order */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  /* composing NULL under a name removes the function registered under it */
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3785 
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicates A with the SeqAIJ routine, then converts the new matrix in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* step 1: host-side duplicate */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* step 2: in-place conversion of the duplicate */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3795 
3796 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3797 {
3798   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3799   Mat_SeqAIJCUSPARSE *cy;
3800   Mat_SeqAIJCUSPARSE *cx;
3801   PetscScalar        *ay;
3802   const PetscScalar  *ax;
3803   CsrMatrix          *csry, *csrx;
3804 
3805   PetscFunctionBegin;
3806   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3807   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3808   if (X->ops->axpy != Y->ops->axpy) {
3809     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3810     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3811     PetscFunctionReturn(PETSC_SUCCESS);
3812   }
3813   /* if we are here, it means both matrices are bound to GPU */
3814   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3815   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3816   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3817   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3818   csry = (CsrMatrix *)cy->mat->mat;
3819   csrx = (CsrMatrix *)cx->mat->mat;
3820   /* see if we can turn this into a cublas axpy */
3821   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3822     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3823     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3824     if (eq) str = SAME_NONZERO_PATTERN;
3825   }
3826   /* spgeam is buggy with one column */
3827   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3828 
3829   if (str == SUBSET_NONZERO_PATTERN) {
3830     PetscScalar b = 1.0;
3831 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3832     size_t bufferSize;
3833     void  *buffer;
3834 #endif
3835 
3836     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3837     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3838     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3839 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3840     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3841                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3842     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3843     PetscCall(PetscLogGpuTimeBegin());
3844     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3845                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3846     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3847     PetscCall(PetscLogGpuTimeEnd());
3848     PetscCallCUDA(cudaFree(buffer));
3849 #else
3850     PetscCall(PetscLogGpuTimeBegin());
3851     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3852                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3853     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3854     PetscCall(PetscLogGpuTimeEnd());
3855 #endif
3856     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3857     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3858     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3859     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3860   } else if (str == SAME_NONZERO_PATTERN) {
3861     cublasHandle_t cublasv2handle;
3862     PetscBLASInt   one = 1, bnz = 1;
3863 
3864     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3865     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3866     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3867     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3868     PetscCall(PetscLogGpuTimeBegin());
3869     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3870     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3871     PetscCall(PetscLogGpuTimeEnd());
3872     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3873     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3874     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3875   } else {
3876     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3877     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3878   }
3879   PetscFunctionReturn(PETSC_SUCCESS);
3880 }
3881 
3882 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3883 {
3884   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3885   PetscScalar   *ay;
3886   cublasHandle_t cublasv2handle;
3887   PetscBLASInt   one = 1, bnz = 1;
3888 
3889   PetscFunctionBegin;
3890   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3891   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3892   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3893   PetscCall(PetscLogGpuTimeBegin());
3894   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3895   PetscCall(PetscLogGpuFlops(bnz));
3896   PetscCall(PetscLogGpuTimeEnd());
3897   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3898   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3899   PetscFunctionReturn(PETSC_SUCCESS);
3900 }
3901 
3902 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3903 {
3904   PetscBool   both = PETSC_FALSE;
3905   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
3906 
3907   PetscFunctionBegin;
3908   if (A->factortype == MAT_FACTOR_NONE) {
3909     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3910     if (spptr->mat) {
3911       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3912       if (matrix->values) {
3913         both = PETSC_TRUE;
3914         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3915       }
3916     }
3917     if (spptr->matTranspose) {
3918       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3919       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3920     }
3921   }
3922   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3923   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3924   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3925   else A->offloadmask = PETSC_OFFLOAD_CPU;
3926   PetscFunctionReturn(PETSC_SUCCESS);
3927 }
3928 
3929 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3930 {
3931   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3932 
3933   PetscFunctionBegin;
3934   if (A->factortype != MAT_FACTOR_NONE) {
3935     A->boundtocpu = flg;
3936     PetscFunctionReturn(PETSC_SUCCESS);
3937   }
3938   if (flg) {
3939     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3940 
3941     A->ops->scale                     = MatScale_SeqAIJ;
3942     A->ops->axpy                      = MatAXPY_SeqAIJ;
3943     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3944     A->ops->mult                      = MatMult_SeqAIJ;
3945     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3946     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3947     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3948     A->ops->multhermitiantranspose    = NULL;
3949     A->ops->multhermitiantransposeadd = NULL;
3950     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3951     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3952     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3953     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3954     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3955     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3956     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3957     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3958   } else {
3959     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3960     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3961     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3962     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3963     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3964     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3965     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3966     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3967     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3968     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3969     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3970     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3971     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3972     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3973     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3974     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3975     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3976 
3977     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
3978     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3979     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3980     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
3981     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
3982     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3983   }
3984   A->boundtocpu = flg;
3985   if (flg && a->inode.size) {
3986     a->inode.use = PETSC_TRUE;
3987   } else {
3988     a->inode.use = PETSC_FALSE;
3989   }
3990   PetscFunctionReturn(PETSC_SUCCESS);
3991 }
3992 
3993 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3994 {
3995   Mat B;
3996 
3997   PetscFunctionBegin;
3998   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
3999   if (reuse == MAT_INITIAL_MATRIX) {
4000     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4001   } else if (reuse == MAT_REUSE_MATRIX) {
4002     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4003   }
4004   B = *newmat;
4005 
4006   PetscCall(PetscFree(B->defaultvectype));
4007   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4008 
4009   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4010     if (B->factortype == MAT_FACTOR_NONE) {
4011       Mat_SeqAIJCUSPARSE *spptr;
4012       PetscCall(PetscNew(&spptr));
4013       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4014       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4015       spptr->format = MAT_CUSPARSE_CSR;
4016 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4017   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4018       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4019   #else
4020       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4021   #endif
4022       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4023       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4024 #endif
4025       B->spptr = spptr;
4026     } else {
4027       Mat_SeqAIJCUSPARSETriFactors *spptr;
4028 
4029       PetscCall(PetscNew(&spptr));
4030       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4031       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4032       B->spptr = spptr;
4033     }
4034     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4035   }
4036   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
4037   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
4038   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
4039   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4040   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4041   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
4042 
4043   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4044   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4045   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4046 #if defined(PETSC_HAVE_HYPRE)
4047   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4048 #endif
4049   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4050   PetscFunctionReturn(PETSC_SUCCESS);
4051 }
4052 
4053 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4054 {
4055   PetscFunctionBegin;
4056   PetscCall(MatCreate_SeqAIJ(B));
4057   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4058   PetscFunctionReturn(PETSC_SUCCESS);
4059 }
4060 
4061 /*MC
4062    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4063 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4065    CSR, ELL, or Hybrid format.
4066    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4067 
4068    Options Database Keys:
4069 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4070 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4071                                       Other options include ell (ellpack) or hyb (hybrid).
4072 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4073 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4074 
4075   Level: beginner
4076 
4077 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4078 M*/
4079 
4080 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
4081 
4082 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4083 {
4084   PetscFunctionBegin;
4085   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
4086   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4087   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4088   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4089   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4090 
4091   PetscFunctionReturn(PETSC_SUCCESS);
4092 }
4093 
4094 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
4095 {
4096   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;
4097 
4098   PetscFunctionBegin;
4099   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4100   delete cusp->cooPerm;
4101   delete cusp->cooPerm_a;
4102   cusp->cooPerm   = NULL;
4103   cusp->cooPerm_a = NULL;
4104   if (cusp->use_extended_coo) {
4105     PetscCallCUDA(cudaFree(cusp->jmap_d));
4106     PetscCallCUDA(cudaFree(cusp->perm_d));
4107   }
4108   cusp->use_extended_coo = PETSC_FALSE;
4109   PetscFunctionReturn(PETSC_SUCCESS);
4110 }
4111 
4112 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
4113 {
4114   PetscFunctionBegin;
4115   if (*cusparsestruct) {
4116     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
4117     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
4118     delete (*cusparsestruct)->workVector;
4119     delete (*cusparsestruct)->rowoffsets_gpu;
4120     delete (*cusparsestruct)->cooPerm;
4121     delete (*cusparsestruct)->cooPerm_a;
4122     delete (*cusparsestruct)->csr2csc_i;
4123     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
4124     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
4125     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
4126     PetscCall(PetscFree(*cusparsestruct));
4127   }
4128   PetscFunctionReturn(PETSC_SUCCESS);
4129 }
4130 
4131 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4132 {
4133   PetscFunctionBegin;
4134   if (*mat) {
4135     delete (*mat)->values;
4136     delete (*mat)->column_indices;
4137     delete (*mat)->row_offsets;
4138     delete *mat;
4139     *mat = 0;
4140   }
4141   PetscFunctionReturn(PETSC_SUCCESS);
4142 }
4143 
4144 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4145 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4146 {
4147   PetscFunctionBegin;
4148   if (*trifactor) {
4149     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4150     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4151     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4152     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4153     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4154   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4155     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4156   #endif
4157     PetscCall(PetscFree(*trifactor));
4158   }
4159   PetscFunctionReturn(PETSC_SUCCESS);
4160 }
4161 #endif
4162 
4163 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4164 {
4165   CsrMatrix *mat;
4166 
4167   PetscFunctionBegin;
4168   if (*matstruct) {
4169     if ((*matstruct)->mat) {
4170       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4172         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4173 #else
4174         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4175         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4176 #endif
4177       } else {
4178         mat = (CsrMatrix *)(*matstruct)->mat;
4179         PetscCall(CsrMatrix_Destroy(&mat));
4180       }
4181     }
4182     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4183     delete (*matstruct)->cprowIndices;
4184     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4185     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4186     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4187 
4188 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4189     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4190     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4191     for (int i = 0; i < 3; i++) {
4192       if (mdata->cuSpMV[i].initialized) {
4193         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4194         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4195         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4196       }
4197     }
4198 #endif
4199     delete *matstruct;
4200     *matstruct = NULL;
4201   }
4202   PetscFunctionReturn(PETSC_SUCCESS);
4203 }
4204 
4205 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4206 {
4207   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4208 
4209   PetscFunctionBegin;
4210   if (fs) {
4211 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4212     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4213     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4214     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4215     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4216     delete fs->workVector;
4217     fs->workVector = NULL;
4218 #endif
4219     delete fs->rpermIndices;
4220     delete fs->cpermIndices;
4221     fs->rpermIndices = NULL;
4222     fs->cpermIndices = NULL;
4223     if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
4224     if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
4225     fs->init_dev_prop = PETSC_FALSE;
4226 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4227     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4228     PetscCallCUDA(cudaFree(fs->csrColIdx));
4229     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4230     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4231     PetscCallCUDA(cudaFree(fs->csrVal));
4232     PetscCallCUDA(cudaFree(fs->diag));
4233     PetscCallCUDA(cudaFree(fs->X));
4234     PetscCallCUDA(cudaFree(fs->Y));
4235     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4236     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4237     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4238     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4239     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4240     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4241     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4242     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4243     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4244     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4245     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4246     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4247     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4248     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4249     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4250     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4251     PetscCall(PetscFree(fs->csrRowPtr_h));
4252     PetscCall(PetscFree(fs->csrVal_h));
4253     PetscCall(PetscFree(fs->diag_h));
4254     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4255     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4256 #endif
4257   }
4258   PetscFunctionReturn(PETSC_SUCCESS);
4259 }
4260 
4261 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4262 {
4263   PetscFunctionBegin;
4264   if (*trifactors) {
4265     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4266     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4267     PetscCall(PetscFree(*trifactors));
4268   }
4269   PetscFunctionReturn(PETSC_SUCCESS);
4270 }
4271 
4272 struct IJCompare {
4273   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4274   {
4275     if (t1.get<0>() < t2.get<0>()) return true;
4276     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4277     return false;
4278   }
4279 };
4280 
4281 struct IJEqual {
4282   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4283   {
4284     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
4285     return true;
4286   }
4287 };
4288 
4289 struct IJDiff {
4290   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
4291 };
4292 
4293 struct IJSum {
4294   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
4295 };
4296 
4297 #include <thrust/iterator/discard_iterator.h>
4298 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4299 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4300 {
4301   Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
4302   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
4303   THRUSTARRAY                          *cooPerm_v = NULL;
4304   thrust::device_ptr<const PetscScalar> d_v;
4305   CsrMatrix                            *matrix;
4306   PetscInt                              n;
4307 
4308   PetscFunctionBegin;
4309   PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
4310   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
4311   if (!cusp->cooPerm) {
4312     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
4313     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
4314     PetscFunctionReturn(PETSC_SUCCESS);
4315   }
4316   matrix = (CsrMatrix *)cusp->mat->mat;
4317   PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4318   if (!v) {
4319     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4320     goto finalize;
4321   }
4322   n = cusp->cooPerm->size();
4323   if (isCudaMem(v)) {
4324     d_v = thrust::device_pointer_cast(v);
4325   } else {
4326     cooPerm_v = new THRUSTARRAY(n);
4327     cooPerm_v->assign(v, v + n);
4328     d_v = cooPerm_v->data();
4329     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4330   }
4331   PetscCall(PetscLogGpuTimeBegin());
4332   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4333     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add these them */
4334       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4335       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4336       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4337         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
4338         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4339       */
4340       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4341       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
4342       delete cooPerm_w;
4343     } else {
4344       /* all nonzeros in d_v[] are unique entries */
4345       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4346       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4347       thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4348     }
4349   } else {
4350     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4351       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4352       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4353     } else {
4354       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4355       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4356       thrust::for_each(zibit, zieit, VecCUDAEquals());
4357     }
4358   }
4359   PetscCall(PetscLogGpuTimeEnd());
4360 finalize:
4361   delete cooPerm_v;
4362   A->offloadmask = PETSC_OFFLOAD_GPU;
4363   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4364   /* shorter version of MatAssemblyEnd_SeqAIJ */
4365   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
4366   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
4367   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
4368   a->reallocs = 0;
4369   A->info.mallocs += 0;
4370   A->info.nz_unneeded = 0;
4371   A->assembled = A->was_assembled = PETSC_TRUE;
4372   A->num_ass++;
4373   PetscFunctionReturn(PETSC_SUCCESS);
4374 }
4375 
4376 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4377 {
4378   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4379 
4380   PetscFunctionBegin;
4381   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4382   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4383   if (destroy) {
4384     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4385     delete cusp->csr2csc_i;
4386     cusp->csr2csc_i = NULL;
4387   }
4388   A->transupdated = PETSC_FALSE;
4389   PetscFunctionReturn(PETSC_SUCCESS);
4390 }
4391 
4392 #include <thrust/binary_search.h>
4393 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4394 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4395 {
4396   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4397   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
4398   PetscInt            cooPerm_n, nzr = 0;
4399 
4400   PetscFunctionBegin;
4401   PetscCall(PetscLayoutSetUp(A->rmap));
4402   PetscCall(PetscLayoutSetUp(A->cmap));
4403   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4404   if (n != cooPerm_n) {
4405     delete cusp->cooPerm;
4406     delete cusp->cooPerm_a;
4407     cusp->cooPerm   = NULL;
4408     cusp->cooPerm_a = NULL;
4409   }
4410   if (n) {
4411     thrust::device_ptr<PetscInt> d_i, d_j;
4412     PetscInt                    *d_raw_i, *d_raw_j;
4413     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4414     PetscMemType                 imtype, jmtype;
4415 
4416     PetscCall(PetscGetMemType(coo_i, &imtype));
4417     if (PetscMemTypeHost(imtype)) {
4418       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4419       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4420       d_i        = thrust::device_pointer_cast(d_raw_i);
4421       free_raw_i = PETSC_TRUE;
4422       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4423     } else {
4424       d_i = thrust::device_pointer_cast(coo_i);
4425     }
4426 
4427     PetscCall(PetscGetMemType(coo_j, &jmtype));
4428     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4429       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4430       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4431       d_j        = thrust::device_pointer_cast(d_raw_j);
4432       free_raw_j = PETSC_TRUE;
4433       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4434     } else {
4435       d_j = thrust::device_pointer_cast(coo_j);
4436     }
4437 
4438     THRUSTINTARRAY ii(A->rmap->n);
4439 
4440     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4441     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
4442 
4443     /* Ex.
4444       n = 6
4445       coo_i = [3,3,1,4,1,4]
4446       coo_j = [3,2,2,5,2,6]
4447     */
4448     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4449     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
4450 
4451     PetscCall(PetscLogGpuTimeBegin());
4452     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4453     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4454     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4455     THRUSTINTARRAY w(d_j, d_j + n);
4456 
4457     /*
4458       d_i     = [1,1,3,3,4,4]
4459       d_j     = [2,2,2,3,5,6]
4460       cooPerm = [2,4,1,0,3,5]
4461     */
4462     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4463 
4464     /*
4465       d_i     = [1,3,3,4,4,x]
4466                             ^ekey
4467       d_j     = [2,2,3,5,6,x]
4468                            ^nekye
4469     */
4470     if (nekey == ekey) { /* all entries are unique */
4471       delete cusp->cooPerm_a;
4472       cusp->cooPerm_a = NULL;
4473     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4474       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4475       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4476       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4477       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4478       w[0]                  = 0;
4479       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4480       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4481     }
4482     thrust::counting_iterator<PetscInt> search_begin(0);
4483     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4484                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4485                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4486     PetscCall(PetscLogGpuTimeEnd());
4487 
4488     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
4489     a->singlemalloc = PETSC_FALSE;
4490     a->free_a       = PETSC_TRUE;
4491     a->free_ij      = PETSC_TRUE;
4492     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4493     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4494     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4495     a->nz = a->maxnz = a->i[A->rmap->n];
4496     a->rmax          = 0;
4497     PetscCall(PetscMalloc1(a->nz, &a->a));
4498     PetscCall(PetscMalloc1(a->nz, &a->j));
4499     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4500     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
4501     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
4502     for (PetscInt i = 0; i < A->rmap->n; i++) {
4503       const PetscInt nnzr = a->i[i + 1] - a->i[i];
4504       nzr += (PetscInt) !!(nnzr);
4505       a->ilen[i] = a->imax[i] = nnzr;
4506       a->rmax                 = PetscMax(a->rmax, nnzr);
4507     }
4508     a->nonzerorowcnt = nzr;
4509     A->preallocated  = PETSC_TRUE;
4510     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
4511     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4512     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4513     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
4514   } else {
4515     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
4516   }
4517   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
4518 
4519   /* We want to allocate the CUSPARSE struct for matvec now.
4520      The code is so convoluted now that I prefer to copy zeros */
4521   PetscCall(PetscArrayzero(a->a, a->nz));
4522   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
4523   A->offloadmask = PETSC_OFFLOAD_CPU;
4524   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4525   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
4526   PetscFunctionReturn(PETSC_SUCCESS);
4527 }
4528 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - records the COO (i,j) pattern used by later MatSetValuesCOO() calls

  The two MatResetPreallocationCOO_*() routines are called first. If coo_i[] is in host memory and a
  negative entry is found in coo_i[] or coo_j[], the pattern is built by MatSetPreallocationCOO_SeqAIJ(),
  the matrix is copied to the GPU, and the host jmap[]/perm[] arrays are mirrored on the device with
  use_extended_coo set. In every other case (coo_i == NULL, coo_i[] not in host memory so the indices are
  not inspected, or no negative index found) the work is delegated to
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool    has_negative = PETSC_FALSE;
  PetscMemType mtype        = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* stop scanning at the first negative row or column index */
      for (PetscCount k = 0; k < coo_n && !has_negative; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) has_negative = PETSC_TRUE;
      }
    }
  }

  if (!has_negative) {
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* build the pattern on the host, then push the matrix and the COO maps to the device */
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
  {
    Mat_SeqAIJ         *host       = static_cast<Mat_SeqAIJ *>(mat->data);
    Mat_SeqAIJCUSPARSE *device     = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    const size_t        jmap_bytes = (host->nz + 1) * sizeof(PetscCount);
    const size_t        perm_bytes = host->Atot * sizeof(PetscCount);

    PetscCallCUDA(cudaMalloc((void **)&device->jmap_d, jmap_bytes));
    PetscCallCUDA(cudaMemcpy(device->jmap_d, host->jmap, jmap_bytes, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&device->perm_d, perm_bytes));
    PetscCallCUDA(cudaMemcpy(device->perm_d, host->perm, perm_bytes, cudaMemcpyHostToDevice));
    device->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4567 
/*
  MatAddCOOValues - CUDA kernel that folds COO input values into the matrix value array

  Input Parameters:
+ kv    - input values, indexed through perm[]
. nnz   - number of entries of a[] to update
. jmap  - offsets, read at positions 0..nnz; entry i of a[] receives kv[perm[k]] for jmap[i] <= k < jmap[i+1]
. perm  - indices into kv[]
- imode - with INSERT_VALUES a[i] is replaced by the sum of its contributions, otherwise the sum is added to a[i]

  Output Parameter:
. a - the array being updated

  Notes:
  Each thread starts at its global thread index and advances by the total number of launched threads,
  so every i < nnz is handled for any 1D launch configuration.

  blockIdx.x, blockDim.x and gridDim.x are 32-bit unsigned values; they are widened to PetscCount before
  being multiplied so the products cannot wrap around for very large launches.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;

  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4578 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets or adds matrix values given in the COO order fixed at preallocation time

  Input Parameters:
+ A     - the matrix
. v     - the values; may be in host or device memory (queried with PetscGetMemType())
- imode - INSERT_VALUES to overwrite the stored values, anything else to accumulate into them

  Notes:
  When use_extended_coo is set, host values are first staged in a temporary device buffer, the
  MatAddCOOValues kernel combines them into the device value array, and the staging buffer is freed.
  Otherwise the call is forwarded to MatSetValuesCOO_SeqAIJCUSPARSE_Basic().

  The staging copy and the kernel are reported to the PETSc log (PetscLogCpuToGpu(), PetscLogGpuTimeBegin()/End()).
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(1. * seq->coo_n * sizeof(PetscScalar)));
    }

    /* write-only access avoids a host-to-device copy of values that are about to be overwritten */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    PetscCall(PetscLogGpuTimeBegin());
    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    /* release the staging buffer allocated above */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4613 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

   Not Collective

    Input Parameters:
+   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers, or `NULL` if not needed
-   j - the CSR column indices, or `NULL` if not needed

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      If both `i` and `j` are `NULL` the routine returns right after validating `A`

      May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* nothing to do only when neither output was requested; a single requested output is still served below */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build the uncompressed row offsets on the device from the host a->i once, and cache them */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4661 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - gives back the device `i` and `j` index arrays obtained with `MatSeqAIJCUSPARSEGetIJ()`

   Not Collective

    Input Parameters:
+   A - the matrix
.   compressed - `PETSC_TRUE` or `PETSC_FALSE`, as passed to `MatSeqAIJCUSPARSEGetIJ()`; not used by this routine
.   i - the CSR row pointers, set to `NULL` on return when provided
-   j - the CSR column indices, set to `NULL` on return when provided

    Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed; /* accepted only to mirror the Get routine's signature */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the caller's handles are cleared; no device data is touched */
  if (j) *j = nullptr;
  if (i) *i = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4687 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - provides read-only access to the device array holding the values of a `MATSEQAIJCUSPARSE` matrix

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format is handled */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *device_csr = (CsrMatrix *)gpu->mat->mat;

    PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = thrust::raw_pointer_cast(device_csr->values->data());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4723 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data, set to `NULL` on return

   Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: the matrix itself is left untouched, only the caller's handle is cleared */
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4746 
/*@C
   MatSeqAIJCUSPARSEGetArray - provides read-write access to the device array holding the values of a `MATSEQAIJCUSPARSE` matrix

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format is handled */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *device_csr = (CsrMatrix *)gpu->mat->mat;

    PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = thrust::raw_pointer_cast(device_csr->values->data());
  }
  /* the caller may modify the values: flag the device copy as the current one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - gives back the read-write array obtained from `MatSeqAIJCUSPARSEGetArray()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data, set to `NULL` on return

   Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4808 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - provides write access to the device array holding the values of a `MATSEQAIJCUSPARSE` matrix

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format is handled */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* the device structure must already exist: nothing is copied from the host here */
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *device_csr = (CsrMatrix *)gpu->mat->mat;

    PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = thrust::raw_pointer_cast(device_csr->values->data());
  }
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4845 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data, set to `NULL` on return

   Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values have been overwritten through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4870 
/* "Less than" on 4-tuples: lexicographic on the first two fields; the third and fourth fields are ignored */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int first1 = t1.get<0>();
    const int first2 = t2.get<0>();

    /* the second field only breaks ties on the first one */
    if (first1 != first2) return first1 < first2;
    return t1.get<1>() < t2.get<1>();
  }
};
4879 
/* Unary functor that adds a fixed offset, chosen at construction, to its int argument */
struct Shift {
  int _shift; /* the offset added by operator() */

  Shift(int shift) { _shift = shift; }
  __host__ __device__ inline int operator()(const int &c)
  {
    const int shifted = _shift + c;
    return shifted;
  }
};
4886 
4887 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4888 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4889 {
4890   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4891   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4892   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4893   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4894   PetscInt                      Annz, Bnnz;
4895   cusparseStatus_t              stat;
4896   PetscInt                      i, m, n, zero = 0;
4897 
4898   PetscFunctionBegin;
4899   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4900   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4901   PetscValidPointer(C, 4);
4902   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4903   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4904   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4905   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4906   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4907   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4908   if (reuse == MAT_INITIAL_MATRIX) {
4909     m = A->rmap->n;
4910     n = A->cmap->n + B->cmap->n;
4911     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4912     PetscCall(MatSetSizes(*C, m, n, m, n));
4913     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4914     c                       = (Mat_SeqAIJ *)(*C)->data;
4915     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4916     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4917     Ccsr                    = new CsrMatrix;
4918     Cmat->cprowIndices      = NULL;
4919     c->compressedrow.use    = PETSC_FALSE;
4920     c->compressedrow.nrows  = 0;
4921     c->compressedrow.i      = NULL;
4922     c->compressedrow.rindex = NULL;
4923     Ccusp->workVector       = NULL;
4924     Ccusp->nrows            = m;
4925     Ccusp->mat              = Cmat;
4926     Ccusp->mat->mat         = Ccsr;
4927     Ccsr->num_rows          = m;
4928     Ccsr->num_cols          = n;
4929     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4930     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4931     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4932     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4933     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4934     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4935     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4936     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4937     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4938     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4939     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4940     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4941     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4942 
4943     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4944     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4945     Annz                 = (PetscInt)Acsr->column_indices->size();
4946     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4947     c->nz                = Annz + Bnnz;
4948     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4949     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4950     Ccsr->values         = new THRUSTARRAY(c->nz);
4951     Ccsr->num_entries    = c->nz;
4952     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4953     if (c->nz) {
4954       auto              Acoo = new THRUSTINTARRAY32(Annz);
4955       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4956       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4957       THRUSTINTARRAY32 *Aroff, *Broff;
4958 
4959       if (a->compressedrow.use) { /* need full row offset */
4960         if (!Acusp->rowoffsets_gpu) {
4961           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4962           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4963           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4964         }
4965         Aroff = Acusp->rowoffsets_gpu;
4966       } else Aroff = Acsr->row_offsets;
4967       if (b->compressedrow.use) { /* need full row offset */
4968         if (!Bcusp->rowoffsets_gpu) {
4969           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4970           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4971           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4972         }
4973         Broff = Bcusp->rowoffsets_gpu;
4974       } else Broff = Bcsr->row_offsets;
4975       PetscCall(PetscLogGpuTimeBegin());
4976       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4977       PetscCallCUSPARSE(stat);
4978       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4979       PetscCallCUSPARSE(stat);
4980       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4981       auto Aperm = thrust::make_constant_iterator(1);
4982       auto Bperm = thrust::make_constant_iterator(0);
4983 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4984       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4985       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4986 #else
4987       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4988       auto Bcib = Bcsr->column_indices->begin();
4989       auto Bcie = Bcsr->column_indices->end();
4990       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4991 #endif
4992       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4993       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4994       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4995       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4996       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4997       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4998       auto p1    = Ccusp->cooPerm->begin();
4999       auto p2    = Ccusp->cooPerm->begin();
5000       thrust::advance(p2, Annz);
5001       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
5002 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
5003       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
5004 #endif
5005       auto cci = thrust::make_counting_iterator(zero);
5006       auto cce = thrust::make_counting_iterator(c->nz);
5007 #if 0 //Errors on SUMMIT cuda 11.1.0
5008       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
5009 #else
5010       auto pred = thrust::identity<int>();
5011       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
5012       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
5013 #endif
5014       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
5015       PetscCallCUSPARSE(stat);
5016       PetscCall(PetscLogGpuTimeEnd());
5017       delete wPerm;
5018       delete Acoo;
5019       delete Bcoo;
5020       delete Ccoo;
5021 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5022       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5023       PetscCallCUSPARSE(stat);
5024 #endif
5025       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
5026         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
5027         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5028         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5029         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5030         CsrMatrix                    *CcsrT = new CsrMatrix;
5031         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5032         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5033 
5034         (*C)->form_explicit_transpose = PETSC_TRUE;
5035         (*C)->transupdated            = PETSC_TRUE;
5036         Ccusp->rowoffsets_gpu         = NULL;
5037         CmatT->cprowIndices           = NULL;
5038         CmatT->mat                    = CcsrT;
5039         CcsrT->num_rows               = n;
5040         CcsrT->num_cols               = m;
5041         CcsrT->num_entries            = c->nz;
5042 
5043         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
5044         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5045         CcsrT->values         = new THRUSTARRAY(c->nz);
5046 
5047         PetscCall(PetscLogGpuTimeBegin());
5048         auto rT = CcsrT->row_offsets->begin();
5049         if (AT) {
5050           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
5051           thrust::advance(rT, -1);
5052         }
5053         if (BT) {
5054           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
5055           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
5056           thrust::copy(titb, tite, rT);
5057         }
5058         auto cT = CcsrT->column_indices->begin();
5059         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
5060         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
5061         auto vT = CcsrT->values->begin();
5062         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5063         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5064         PetscCall(PetscLogGpuTimeEnd());
5065 
5066         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
5067         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
5068         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5069         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
5070         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
5071         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
5072         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5073         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5074         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5075 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5076         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5077         PetscCallCUSPARSE(stat);
5078 #endif
5079         Ccusp->matTranspose = CmatT;
5080       }
5081     }
5082 
5083     c->singlemalloc = PETSC_FALSE;
5084     c->free_a       = PETSC_TRUE;
5085     c->free_ij      = PETSC_TRUE;
5086     PetscCall(PetscMalloc1(m + 1, &c->i));
5087     PetscCall(PetscMalloc1(c->nz, &c->j));
5088     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
5089       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5090       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5091       ii = *Ccsr->row_offsets;
5092       jj = *Ccsr->column_indices;
5093       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5094       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5095     } else {
5096       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5097       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5098     }
5099     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
5100     PetscCall(PetscMalloc1(m, &c->ilen));
5101     PetscCall(PetscMalloc1(m, &c->imax));
5102     c->maxnz         = c->nz;
5103     c->nonzerorowcnt = 0;
5104     c->rmax          = 0;
5105     for (i = 0; i < m; i++) {
5106       const PetscInt nn = c->i[i + 1] - c->i[i];
5107       c->ilen[i] = c->imax[i] = nn;
5108       c->nonzerorowcnt += (PetscInt) !!nn;
5109       c->rmax = PetscMax(c->rmax, nn);
5110     }
5111     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
5112     PetscCall(PetscMalloc1(c->nz, &c->a));
5113     (*C)->nonzerostate++;
5114     PetscCall(PetscLayoutSetUp((*C)->rmap));
5115     PetscCall(PetscLayoutSetUp((*C)->cmap));
5116     Ccusp->nonzerostate = (*C)->nonzerostate;
5117     (*C)->preallocated  = PETSC_TRUE;
5118   } else {
5119     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
5120     c = (Mat_SeqAIJ *)(*C)->data;
5121     if (c->nz) {
5122       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
5123       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
5124       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
5125       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
5126       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5127       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5128       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5129       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5130       Acsr = (CsrMatrix *)Acusp->mat->mat;
5131       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
5132       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
5133       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
5134       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
5135       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
5136       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
5137       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
5138       auto pmid = Ccusp->cooPerm->begin();
5139       thrust::advance(pmid, Acsr->num_entries);
5140       PetscCall(PetscLogGpuTimeBegin());
5141       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
5142       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5143       thrust::for_each(zibait, zieait, VecCUDAEquals());
5144       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5145       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
5146       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
5147       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
5148       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5149         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5150         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5151         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5152         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5153         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5154         auto       vT    = CcsrT->values->begin();
5155         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5156         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5157         (*C)->transupdated = PETSC_TRUE;
5158       }
5159       PetscCall(PetscLogGpuTimeEnd());
5160     }
5161   }
5162   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5163   (*C)->assembled     = PETSC_TRUE;
5164   (*C)->was_assembled = PETSC_FALSE;
5165   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5166   PetscFunctionReturn(PETSC_SUCCESS);
5167 }
5168 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the value array of A into v.

  Input Parameters:
+ A   - matrix whose value array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of values to copy
- idx - host array of n positions in the value array to gather, or NULL to copy the first n values

  Output Parameter:
. v - destination of length n; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  The value array is accessed as device memory. With a non-NULL idx (and n > 0) the indices are
  uploaded and the gather is performed on the device; when v is not device memory the result is
  gathered into a temporary device buffer and then copied back to v.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* Device staging buffer, only sized when v is not device memory. It is an automatic object so that
       it is released on every exit path, including the early return taken by PetscCallCUDA() on failure */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k]; VecCUDAEquals is expected to assign the first tuple element to the second (defined outside this view) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    /* no index set (or nothing to gather): contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled from the device to the host, so account for them as GPU-to-CPU traffic */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5204