xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
25 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29 
30   typedef enum {
31       CUSPARSE_MV_ALG_DEFAULT = 0,
32       CUSPARSE_COOMV_ALG      = 1,
33       CUSPARSE_CSRMV_ALG1     = 2,
34       CUSPARSE_CSRMV_ALG2     = 3
35   } cusparseSpMVAlg_t;
36 
37   typedef enum {
38       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44       CUSPARSE_SPMM_COO_ALG1    = 1,
45       CUSPARSE_SPMM_COO_ALG2    = 2,
46       CUSPARSE_SPMM_COO_ALG3    = 3,
47       CUSPARSE_SPMM_COO_ALG4    = 5,
48       CUSPARSE_SPMM_CSR_ALG1    = 4,
49       CUSPARSE_SPMM_CSR_ALG2    = 6,
50   } cusparseSpMMAlg_t;
51 
52   typedef enum {
53       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55   } cusparseCsr2CscAlg_t;
56   */
57 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60 #endif
61 
62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
66 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
68 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
72 #endif
73 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
74 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
75 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
76 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
77 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
83 
84 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
85 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
86 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
88 
89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
91 
92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
93 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
94 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95 
96 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
97 {
98   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
99 
100   PetscFunctionBegin;
101   switch (op) {
102   case MAT_CUSPARSE_MULT:
103     cusparsestruct->format = format;
104     break;
105   case MAT_CUSPARSE_ALL:
106     cusparsestruct->format = format;
107     break;
108   default:
109     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
110   }
111   PetscFunctionReturn(PETSC_SUCCESS);
112 }
113 
114 /*@
115    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
116    operation. Only the `MatMult()` operation can use different GPU storage formats
117 
118    Not Collective
119 
120    Input Parameters:
121 +  A - Matrix of type `MATSEQAIJCUSPARSE`
122 .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
123         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
124 -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
125 
126    Level: intermediate
127 
128 .seealso: [](ch_matrices), `Mat`, `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129 @*/
130 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
131 {
132   PetscFunctionBegin;
133   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
134   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
135   PetscFunctionReturn(PETSC_SUCCESS);
136 }
137 
138 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
139 {
140   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
141 
142   PetscFunctionBegin;
143   cusparsestruct->use_cpu_solve = use_cpu;
144   PetscFunctionReturn(PETSC_SUCCESS);
145 }
146 
147 /*@
148    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
149 
150    Input Parameters:
151 +  A - Matrix of type `MATSEQAIJCUSPARSE`
152 -  use_cpu - set flag for using the built-in CPU `MatSolve()`
153 
154    Level: intermediate
155 
156    Note:
157    The cuSparse LU solver currently computes the factors with the built-in CPU method
158    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
159    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
160 
161 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
162 @*/
163 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
164 {
165   PetscFunctionBegin;
166   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
167   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
168   PetscFunctionReturn(PETSC_SUCCESS);
169 }
170 
171 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
172 {
173   PetscFunctionBegin;
174   switch (op) {
175   case MAT_FORM_EXPLICIT_TRANSPOSE:
176     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
177     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
178     A->form_explicit_transpose = flg;
179     break;
180   default:
181     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
182     break;
183   }
184   PetscFunctionReturn(PETSC_SUCCESS);
185 }
186 
187 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
188 {
189   MatCUSPARSEStorageFormat format;
190   PetscBool                flg;
191   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
192 
193   PetscFunctionBegin;
194   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
195   if (A->factortype == MAT_FACTOR_NONE) {
196     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
197     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
198 
199     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
201     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
202     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
204     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
205     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
206   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
207     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
208   #else
209     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
210   #endif
211     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
212     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
213 
214     PetscCall(
215       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
216     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
217 #endif
218   }
219   PetscOptionsHeadEnd();
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
224 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
225 {
226   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
227   PetscInt                      m  = A->rmap->n;
228   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
229   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
230   const MatScalar              *Aa = a->a;
231   PetscInt                     *Mi, *Mj, Mnz;
232   PetscScalar                  *Ma;
233 
234   PetscFunctionBegin;
235   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
236     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
237       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
238       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
239       PetscCall(PetscMalloc1(m + 1, &Mi));
240       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
241       PetscCall(PetscMalloc1(Mnz, &Ma));
242       Mi[0] = 0;
243       for (PetscInt i = 0; i < m; i++) {
244         PetscInt llen = Ai[i + 1] - Ai[i];
245         PetscInt ulen = Adiag[i] - Adiag[i + 1];
246         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
247         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
248         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
249         Mi[i + 1] = Mi[i] + llen + ulen;
250       }
251       // Copy M (L,U) from host to device
252       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
253       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
254       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
255       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
256       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));
257 
258       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
259       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
260       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
261       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
262       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
263       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
264       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
265       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
266 
267       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
268       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
269       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
270 
271       fillMode = CUSPARSE_FILL_MODE_UPPER;
272       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
273       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
274       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
275       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
276 
277       // Allocate work vectors in SpSv
278       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
279       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));
280 
281       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
282       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
283 
284       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
285       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
286       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
287       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
288       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
289       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
290       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
291 
292       // Record for reuse
293       fs->csrRowPtr_h = Mi;
294       fs->csrVal_h    = Ma;
295       PetscCall(PetscFree(Mj));
296     }
297     // Copy the value
298     Mi  = fs->csrRowPtr_h;
299     Ma  = fs->csrVal_h;
300     Mnz = Mi[m];
301     for (PetscInt i = 0; i < m; i++) {
302       PetscInt llen = Ai[i + 1] - Ai[i];
303       PetscInt ulen = Adiag[i] - Adiag[i + 1];
304       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
305       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
306       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
307     }
308     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
309 
310     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
311     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
312 
313     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
314 
315     // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
316     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
317   }
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 #else
321 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
322 {
323   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
324   PetscInt                           n                  = A->rmap->n;
325   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
326   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
327   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
328   const MatScalar                   *aa = a->a, *v;
329   PetscInt                          *AiLo, *AjLo;
330   PetscInt                           i, nz, nzLower, offset, rowOffset;
331 
332   PetscFunctionBegin;
333   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
334   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
335     try {
336       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
337       nzLower = n + ai[n] - ai[1];
338       if (!loTriFactor) {
339         PetscScalar *AALo;
340 
341         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
342 
343         /* Allocate Space for the lower triangular matrix */
344         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
345         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
346 
347         /* Fill the lower triangular matrix */
348         AiLo[0]   = (PetscInt)0;
349         AiLo[n]   = nzLower;
350         AjLo[0]   = (PetscInt)0;
351         AALo[0]   = (MatScalar)1.0;
352         v         = aa;
353         vi        = aj;
354         offset    = 1;
355         rowOffset = 1;
356         for (i = 1; i < n; i++) {
357           nz = ai[i + 1] - ai[i];
358           /* additional 1 for the term on the diagonal */
359           AiLo[i] = rowOffset;
360           rowOffset += nz + 1;
361 
362           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
363           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
364 
365           offset += nz;
366           AjLo[offset] = (PetscInt)i;
367           AALo[offset] = (MatScalar)1.0;
368           offset += 1;
369 
370           v += nz;
371           vi += nz;
372         }
373 
374         /* allocate space for the triangular factor information */
375         PetscCall(PetscNew(&loTriFactor));
376         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
377         /* Create the matrix description */
378         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
379         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
380   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
381         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
382   #else
383         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
384   #endif
385         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
386         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
387 
388         /* set the operation */
389         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
390 
391         /* set the matrix */
392         loTriFactor->csrMat              = new CsrMatrix;
393         loTriFactor->csrMat->num_rows    = n;
394         loTriFactor->csrMat->num_cols    = n;
395         loTriFactor->csrMat->num_entries = nzLower;
396 
397         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
398         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
399 
400         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
401         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
402 
403         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
404         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
405 
406         /* Create the solve analysis information */
407         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
408         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
409   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
410         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
411                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
412         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
413   #endif
414 
415         /* perform the solve analysis */
416         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
417                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
418         PetscCallCUDA(WaitForCUDA());
419         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
420 
421         /* assign the pointer */
422         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
423         loTriFactor->AA_h                                          = AALo;
424         PetscCallCUDA(cudaFreeHost(AiLo));
425         PetscCallCUDA(cudaFreeHost(AjLo));
426         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
427       } else { /* update values only */
428         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
429         /* Fill the lower triangular matrix */
430         loTriFactor->AA_h[0] = 1.0;
431         v                    = aa;
432         vi                   = aj;
433         offset               = 1;
434         for (i = 1; i < n; i++) {
435           nz = ai[i + 1] - ai[i];
436           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
437           offset += nz;
438           loTriFactor->AA_h[offset] = 1.0;
439           offset += 1;
440           v += nz;
441         }
442         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
443         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
444       }
445     } catch (char *ex) {
446       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
447     }
448   }
449   PetscFunctionReturn(PETSC_SUCCESS);
450 }
451 
452 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
453 {
454   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
455   PetscInt                           n                  = A->rmap->n;
456   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
457   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
458   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
459   const MatScalar                   *aa = a->a, *v;
460   PetscInt                          *AiUp, *AjUp;
461   PetscInt                           i, nz, nzUpper, offset;
462 
463   PetscFunctionBegin;
464   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
465   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
466     try {
467       /* next, figure out the number of nonzeros in the upper triangular matrix. */
468       nzUpper = adiag[0] - adiag[n];
469       if (!upTriFactor) {
470         PetscScalar *AAUp;
471 
472         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
473 
474         /* Allocate Space for the upper triangular matrix */
475         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
476         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
477 
478         /* Fill the upper triangular matrix */
479         AiUp[0] = (PetscInt)0;
480         AiUp[n] = nzUpper;
481         offset  = nzUpper;
482         for (i = n - 1; i >= 0; i--) {
483           v  = aa + adiag[i + 1] + 1;
484           vi = aj + adiag[i + 1] + 1;
485 
486           /* number of elements NOT on the diagonal */
487           nz = adiag[i] - adiag[i + 1] - 1;
488 
489           /* decrement the offset */
490           offset -= (nz + 1);
491 
492           /* first, set the diagonal elements */
493           AjUp[offset] = (PetscInt)i;
494           AAUp[offset] = (MatScalar)1. / v[nz];
495           AiUp[i]      = AiUp[i + 1] - (nz + 1);
496 
497           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
498           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
499         }
500 
501         /* allocate space for the triangular factor information */
502         PetscCall(PetscNew(&upTriFactor));
503         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
504 
505         /* Create the matrix description */
506         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
507         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
508   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
509         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
510   #else
511         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
512   #endif
513         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
514         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
515 
516         /* set the operation */
517         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
518 
519         /* set the matrix */
520         upTriFactor->csrMat              = new CsrMatrix;
521         upTriFactor->csrMat->num_rows    = n;
522         upTriFactor->csrMat->num_cols    = n;
523         upTriFactor->csrMat->num_entries = nzUpper;
524 
525         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
526         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
527 
528         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
529         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
530 
531         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
532         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
533 
534         /* Create the solve analysis information */
535         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
537   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
538         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
539                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
540         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
541   #endif
542 
543         /* perform the solve analysis */
544         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
545                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
546 
547         PetscCallCUDA(WaitForCUDA());
548         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
549 
550         /* assign the pointer */
551         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
552         upTriFactor->AA_h                                          = AAUp;
553         PetscCallCUDA(cudaFreeHost(AiUp));
554         PetscCallCUDA(cudaFreeHost(AjUp));
555         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
556       } else {
557         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
558         /* Fill the upper triangular matrix */
559         offset = nzUpper;
560         for (i = n - 1; i >= 0; i--) {
561           v = aa + adiag[i + 1] + 1;
562 
563           /* number of elements NOT on the diagonal */
564           nz = adiag[i] - adiag[i + 1] - 1;
565 
566           /* decrement the offset */
567           offset -= (nz + 1);
568 
569           /* first, set the diagonal elements */
570           upTriFactor->AA_h[offset] = 1. / v[nz];
571           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
572         }
573         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
574         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
575       }
576     } catch (char *ex) {
577       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
578     }
579   }
580   PetscFunctionReturn(PETSC_SUCCESS);
581 }
582 #endif
583 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - bring the ILU/LU factors of A onto the GPU.

  The factor data itself is handled by the Build* helpers selected by the CUDA version. Afterwards
  the number of nonzeros is recorded, A is flagged as valid on both host and device, and the row and
  column permutations are uploaded. A permutation is uploaded only when it is not the identity and
  has not been uploaded before.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowPerm = aij->row, colPerm = aij->icol;
  const PetscInt                nrows   = A->rmap->n;
  PetscBool                     isIdentity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  /* the work vector is created lazily and kept for later calls */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif

  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* row permutation */
  PetscCall(ISIdentity(rowPerm, &isIdentity));
  if (!isIdentity) {
    if (!factors->rpermIndices) {
      const PetscInt *idx;

      PetscCall(ISGetIndices(rowPerm, &idx));
      factors->rpermIndices = new THRUSTINTARRAY(nrows);
      factors->rpermIndices->assign(idx, idx + nrows);
      PetscCall(ISRestoreIndices(rowPerm, &idx));
      PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
    }
  }

  /* column permutation */
  PetscCall(ISIdentity(colPerm, &isIdentity));
  if (!isIdentity) {
    if (!factors->cpermIndices) {
      const PetscInt *idx;

      PetscCall(ISGetIndices(colPerm, &idx));
      factors->cpermIndices = new THRUSTINTARRAY(nrows);
      factors->cpermIndices->assign(idx, idx + nrows);
      PetscCall(ISRestoreIndices(colPerm, &idx));
      PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
630 
631 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - copy the host Cholesky/ICC factor of A to the GPU
  and run the cusparse SpSV analysis on it.

  Nothing is done unless the latest factor lives on the CPU (A->offloadmask == PETSC_OFFLOAD_CPU).
  On the first call the sparsity pattern, descriptors, work vectors and SpSV buffers are created;
  on every call the numerical values are refreshed and the (numeric) SpSV analysis is redone for
  both the U and the U^T solve.

  Input Parameter:
. A - the factored matrix
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt                nrows   = A->rmap->n;
  const PetscInt               *Ui = aij->i, *Uj = aij->j, *Udiag = aij->diag;
  const MatScalar              *Ua = aij->a;

  PetscFunctionBegin;
  // Only act when A's latest factors are on CPU
  if (A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(PETSC_SUCCESS);

  const PetscInt Unz = Ui[nrows]; // number of nonzeros of U, counting the unit diagonal

  // Is it the first time to do the setup? Test csrRowPtr since it is non-null even when nrows = 0
  if (!factors->csrRowPtr) {
    PetscInt    *colIdx_h; // temporary host copy of the column indices
    PetscScalar *val_h, *dinv_h;

    // Re-arrange the (skewed) factored matrix into M, a regular csr matrix on host.
    // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
    PetscCall(PetscMalloc1(Unz, &val_h));
    PetscCall(PetscMalloc1(Unz, &colIdx_h));
    PetscCall(PetscMalloc1(nrows, &dinv_h)); // the diagonal
    for (PetscInt row = 0; row < nrows; row++) {
      const PetscInt start    = Ui[row];
      const PetscInt noffdiag = Ui[row + 1] - start - 1;

      colIdx_h[start] = row;                                                // diagonal entry goes first
      PetscCall(PetscArraycpy(colIdx_h + start + 1, Uj + start, noffdiag)); // entries of U on the right of the diagonal
    }

    // Device storage for M (U); only the pattern is copied here, values follow below
    PetscCallCUDA(cudaMalloc(&factors->csrRowPtr, sizeof(*(factors->csrRowPtr)) * (nrows + 1)));
    PetscCallCUDA(cudaMalloc(&factors->csrColIdx, sizeof(*(factors->csrColIdx)) * Unz));
    PetscCallCUDA(cudaMalloc(&factors->csrVal, sizeof(*(factors->csrVal)) * Unz));
    PetscCallCUDA(cudaMalloc(&factors->diag, sizeof(*(factors->diag)) * nrows));
    PetscCallCUDA(cudaMemcpy(factors->csrRowPtr, Ui, sizeof(*Ui) * (nrows + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(factors->csrColIdx, colIdx_h, sizeof(*colIdx_h) * Unz, cudaMemcpyHostToDevice));

    // Descriptor for U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    // With CUSPARSE_DIAG_TYPE_UNIT the routines assume every diagonal entry equals one and neither read
    // nor modify the stored diagonal, regardless of what those entries are actually set to in memory.
    cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
    cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
    const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

    PetscCallCUSPARSE(cusparseCreateCsr(&factors->spMatDescr_U, nrows, nrows, Unz, factors->csrRowPtr, factors->csrColIdx, factors->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
    PetscCallCUSPARSE(cusparseSpMatSetAttribute(factors->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
    PetscCallCUSPARSE(cusparseSpMatSetAttribute(factors->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

    // Work vectors used by SpSV, and their dense-vector descriptors
    PetscCallCUDA(cudaMalloc((void **)&factors->X, sizeof(*(factors->X)) * nrows));
    PetscCallCUDA(cudaMalloc((void **)&factors->Y, sizeof(*(factors->Y)) * nrows));

    PetscCallCUSPARSE(cusparseCreateDnVec(&factors->dnVecDescr_X, nrows, factors->X, cusparse_scalartype));
    PetscCallCUSPARSE(cusparseCreateDnVec(&factors->dnVecDescr_Y, nrows, factors->Y, cusparse_scalartype));

    // Query the SpSV buffer size for the U solve, then allocate the buffer
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U, &factors->spsvBufferSize_U));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_U, factors->spsvBufferSize_U));

    // The Ut solve shares the matrix descriptor (spMatDescr_U) but has its own SpSV descriptor and buffer
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, &factors->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Ut, factors->spsvBufferSize_Ut));

    // Keep the host value buffers for reuse by later numeric factorizations; the index scratch is done
    factors->csrVal_h = val_h;
    factors->diag_h   = dinv_h;
    PetscCall(PetscFree(colIdx_h));
  }

  // Refresh the numerical values on host, then push them to the device
  PetscScalar *val_h  = factors->csrVal_h;
  PetscScalar *dinv_h = factors->diag_h;

  for (PetscInt row = 0; row < nrows; row++) {
    const PetscInt start    = Ui[row];
    const PetscInt noffdiag = Ui[row + 1] - start - 1;

    dinv_h[row]  = Ua[Udiag[row]]; // actually Ua[Udiag[row]] is the inverse of the diagonal
    val_h[start] = (MatScalar)1.0; // unit diagonal; cosmetic since cusparse does not read it given CUSPARSE_DIAG_TYPE_UNIT
    for (PetscInt k = 0; k < noffdiag; k++) val_h[start + 1 + k] = -Ua[start + k];
  }
  PetscCallCUDA(cudaMemcpy(factors->csrVal, val_h, sizeof(*val_h) * Unz, cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(factors->diag, dinv_h, sizeof(*dinv_h) * nrows, cudaMemcpyHostToDevice));

  // cusparseSpSV_analysis() is numeric and requires valid and up-to-date matrix values
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U, factors->spsvBuffer_U));
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, factors->spsvBuffer_Ut));
  PetscFunctionReturn(PETSC_SUCCESS);
}
716 
/*
  MatSolve_SeqAIJCUSPARSE_Cholesky - solve Ut D U x = b on the GPU with cusparse SpSV.

  Steps: (1) optionally permute b into the work vector X, (2) solve Ut Y = X, (3) scale Y by the
  stored diagonal, (4) solve U X = Y, (5) optionally permute the result into x.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *seq     = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                nrows   = A->rmap->n;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *rhs_d;
  PetscScalar                  *sol_d;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &sol_d));
  PetscCall(VecCUDAGetArrayRead(b, &rhs_d));

  thrust::device_ptr<PetscScalar>       solPtr = thrust::device_pointer_cast(sol_d);
  thrust::device_ptr<const PetscScalar> rhsPtr = thrust::device_pointer_cast(rhs_d);

  // (1) Input of the first solve: b gathered through the row permutation into X, or b itself
  if (!factors->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)rhs_d));
  } else {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsPtr, factors->rpermIndices->begin()), thrust::make_permutation_iterator(rhsPtr, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  }

  // (2) Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut));

  // (3) Solve diag(D) Z = Y. Since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(), this is just the
  // element-wise product Y = Y*D, done in place with thrust because cublas has no such routine.
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(factors->Y), thrust::device_pointer_cast(factors->Y + nrows), thrust::device_pointer_cast(factors->diag), thrust::device_pointer_cast(factors->Y), thrust::multiplies<PetscScalar>()));

  // (4) Solve U X = Y; write to the intermediate buffer X when a column permutation must follow, else straight to x
  void *uSolveOut = factors->cpermIndices ? static_cast<void *>(factors->X) : static_cast<void *>(sol_d);
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, uSolveOut));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  // (5) Gather X through the column permutation into x
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), solPtr));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * seq->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772 #else
/*
  MatSeqAIJCUSPARSEFillICCTriArrays_Private - expand the host ICC factor into the CSR arrays that are uploaded
  for the two triangular solves.

  For every row the entry stored last on the host (v[nz]) is inverted and written first; the remaining nz
  entries follow, negated in AAUp and additionally divided by v[nz] in AALo.

  Input Parameters:
+ n  - number of rows
. ai - host row offsets of the factor
. aj - host column indices of the factor (only read when AjUp is given)
- aa - host values of the factor

  Output Parameters:
+ AiUp - row offsets, entries 0..n-1 are written; pass NULL to skip the pattern
. AjUp - column indices; pass NULL to skip the pattern
. AAUp - values of the factor solved with the non-transposed operation
- AALo - values of the factor solved with the transposed operation
*/
static PetscErrorCode MatSeqAIJCUSPARSEFillICCTriArrays_Private(PetscInt n, const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt *AiUp, PetscInt *AjUp, PetscScalar *AAUp, PetscScalar *AALo)
{
  PetscInt offset = 0;

  PetscFunctionBegin;
  for (PetscInt i = 0; i < n; i++) {
    const MatScalar *v  = aa + ai[i];
    const PetscInt   nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

    /* first, set the diagonal elements */
    if (AiUp) AiUp[i] = offset;
    if (AjUp) AjUp[offset] = i;
    AAUp[offset] = (MatScalar)1.0 / v[nz];
    AALo[offset] = (MatScalar)1.0 / v[nz];
    offset += 1;

    if (nz > 0) {
      if (AjUp) { PetscCall(PetscArraycpy(&(AjUp[offset]), aj + ai[i], nz)); }
      PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
      for (PetscInt j = offset; j < offset + nz; j++) {
        AAUp[j] = -AAUp[j];
        AALo[j] = AAUp[j] / v[nz];
      }
      offset += nz;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECreateICCTriFactor_Private - create one triangular factor structure on the GPU from host
  CSR arrays and run the csrsv solve analysis on it.

  The factor is always described as upper triangular; the caller chooses the diagonal type and the solve
  operation. The new structure is returned through factor; the caller is responsible for storing it.

  Input Parameters:
+ A        - the factored matrix (supplies the sizes, the nonzero count and the cusparse handle)
. diagType - cusparse diagonal type of the factor
. solveOp  - cusparse operation used when solving with the factor
. Ai       - host row offsets, A->rmap->n + 1 entries
. Aj       - host column indices, a->nz entries
- AA       - host values, a->nz entries

  Output Parameter:
. factor - the newly created triangular factor
*/
static PetscErrorCode MatSeqAIJCUSPARSECreateICCTriFactor_Private(Mat A, cusparseDiagType_t diagType, cusparseOperation_t solveOp, const PetscInt *Ai, const PetscInt *Aj, const PetscScalar *AA, Mat_SeqAIJCUSPARSETriFactorStruct **factor)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *tri;

  PetscFunctionBegin;
  /* allocate space for the triangular factor information */
  PetscCall(PetscNew(&tri));
  tri->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&tri->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(tri->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseSetMatType(tri->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
  PetscCallCUSPARSE(cusparseSetMatType(tri->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
  PetscCallCUSPARSE(cusparseSetMatFillMode(tri->descr, CUSPARSE_FILL_MODE_UPPER));
  PetscCallCUSPARSE(cusparseSetMatDiagType(tri->descr, diagType));

  /* set the operation */
  tri->solveOp = solveOp;

  /* set the matrix */
  tri->csrMat              = new CsrMatrix;
  tri->csrMat->num_rows    = A->rmap->n;
  tri->csrMat->num_cols    = A->cmap->n;
  tri->csrMat->num_entries = a->nz;

  tri->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
  tri->csrMat->row_offsets->assign(Ai, Ai + A->rmap->n + 1);

  tri->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
  tri->csrMat->column_indices->assign(Aj, Aj + a->nz);

  tri->csrMat->values = new THRUSTARRAY(a->nz);
  tri->csrMat->values->assign(AA, AA + a->nz);

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&tri->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, tri->solveOp, tri->csrMat->num_rows, tri->csrMat->num_entries, tri->descr, tri->csrMat->values->data().get(), tri->csrMat->row_offsets->data().get(),
                                            tri->csrMat->column_indices->data().get(), tri->solveInfo, &tri->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&tri->solveBuffer, tri->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, tri->solveOp, tri->csrMat->num_rows, tri->csrMat->num_entries, tri->descr, tri->csrMat->values->data().get(), tri->csrMat->row_offsets->data().get(),
                                            tri->csrMat->column_indices->data().get(), tri->solveInfo, tri->solvePolicy, tri->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  *factor = tri;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - build (first call) or refresh (later calls) the two GPU triangular
  factors used for ICC/Cholesky solves from the factor stored on the host.

  Nothing is done for an empty matrix or when the host copy is not the up-to-date one. On the first call
  both factors are created with their sparsity pattern and solve analysis; on later calls only the values
  are re-uploaded. It is an error for exactly one of the two factors to exist; this is checked before any
  pinned host memory is allocated so the error path does not leak those buffers.

  Input Parameter:
. A - the factored matrix
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j;
  const MatScalar                   *aa = b->a;
  const PetscBool                    firstSetup = (!upTriFactor && !loTriFactor) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(PETSC_SUCCESS);

  if (!firstSetup) {
    /* a refresh needs both factors; fail before allocating anything */
    PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
    PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  }

  try {
    PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
    if (firstSetup) {
      /* Allocate Space for the upper triangular matrix */
      PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

      /* Fill the upper triangular matrix: pattern and values */
      AiUp[0] = (PetscInt)0;
      AiUp[n] = nzUpper;
      PetscCall(MatSeqAIJCUSPARSEFillICCTriArrays_Private(n, ai, aj, aa, AiUp, AjUp, AAUp, AALo));

      /* unit-diagonal factor solved as is */
      PetscCall(MatSeqAIJCUSPARSECreateICCTriFactor_Private(A, CUSPARSE_DIAG_TYPE_UNIT, CUSPARSE_OPERATION_NON_TRANSPOSE, AiUp, AjUp, AAUp, &upTriFactor));
      cusparseTriFactors->upTriFactorPtr = upTriFactor;

      /* same pattern, scaled values, solved with the transposed operation */
      PetscCall(MatSeqAIJCUSPARSECreateICCTriFactor_Private(A, CUSPARSE_DIAG_TYPE_NON_UNIT, CUSPARSE_OPERATION_TRANSPOSE, AiUp, AjUp, AALo, &loTriFactor));
      cusparseTriFactors->loTriFactorPtr = loTriFactor;

      PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
      PetscCallCUDA(cudaFreeHost(AiUp));
      PetscCallCUDA(cudaFreeHost(AjUp));
    } else {
      /* the pattern is already on the GPU: recompute and upload the values only */
      PetscCall(MatSeqAIJCUSPARSEFillICCTriArrays_Private(n, ai, NULL, aa, NULL, NULL, AAUp, AALo));
      upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
      PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
    }
    PetscCallCUDA(cudaFreeHost(AAUp));
    PetscCallCUDA(cudaFreeHost(AALo));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969 #endif
970 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - bring the ICC/Cholesky factor of A onto the GPU.

  The factor data itself is handled by the Build* helper selected by the CUDA version. Afterwards the
  nonzero count is recorded, A is flagged as valid on both host and device, and, for a non-identity
  ordering, the permutation and its inverse are uploaded.

  The permutation arrays are created only when they do not exist yet, as in
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(). This routine runs on every numeric factorization, and
  allocating unconditionally would leak the previously uploaded device arrays.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    PetscInt nuploads = 0; /* number of index arrays sent to the GPU by this call */

    if (!cusparseTriFactors->rpermIndices) {
      const PetscInt *rip;

      PetscCall(ISGetIndices(ip, &rip));
      cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
      cusparseTriFactors->rpermIndices->assign(rip, rip + n);
      PetscCall(ISRestoreIndices(ip, &rip));
      nuploads++;
    }
    if (!cusparseTriFactors->cpermIndices) {
      IS              iip;
      const PetscInt *irip;

      /* the column side uses the inverse of the ordering */
      PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
      PetscCall(ISGetIndices(iip, &irip));
      cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
      cusparseTriFactors->cpermIndices->assign(irip, irip + n);
      PetscCall(ISRestoreIndices(iip, &irip));
      PetscCall(ISDestroy(&iip));
      nuploads++;
    }
    /* log once, and only for what was actually transferred */
    if (nuploads) PetscCall(PetscLogCpuToGpu((PetscLogDouble)nuploads * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  The factorization itself is computed on the host by MatCholeskyFactorNumeric_SeqAIJ(); afterwards the
  GPU solve routines are installed on B and the factor is copied to the GPU.

  Input Parameters:
+ A    - the matrix to factor
- info - factorization options, forwarded to the host routine

  Output Parameter:
. B - the factored matrix
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* factor on the host, so make sure the host copy of A is current */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* the same routine serves both the solve and the transpose solve */
  B->ops->solvetranspose = B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* determine which version of MatSolve needs to be used. */
    PetscBool naturalOrdering;

    PetscCall(ISIdentity(((Mat_SeqAIJ *)B->data)->row, &naturalOrdering));
    B->ops->solve          = naturalOrdering ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = naturalOrdering ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolvetranspose = NULL;
  B->ops->matsolve          = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1045 
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054   cusparseIndexBase_t                indexBase;
1055   cusparseMatrixType_t               matrixType;
1056   cusparseFillMode_t                 fillMode;
1057   cusparseDiagType_t                 diagType;
1058 
1059   PetscFunctionBegin;
1060   /* allocate space for the transpose of the lower triangular factor */
1061   PetscCall(PetscNew(&loTriFactorT));
1062   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063 
1064   /* set the matrix descriptors of the lower triangular factor */
1065   matrixType = cusparseGetMatType(loTriFactor->descr);
1066   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069 
1070   /* Create the matrix description */
1071   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076 
1077   /* set the operation */
1078   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079 
1080   /* allocate GPU space for the CSC of the lower triangular factor*/
1081   loTriFactorT->csrMat                 = new CsrMatrix;
1082   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088 
1089   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095   #endif
1096 
1097   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098   {
1099     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104   #else
1105                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106   #endif
1107     PetscCallCUSPARSE(stat);
1108   }
1109 
1110   PetscCallCUDA(WaitForCUDA());
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112 
1113   /* Create the solve analysis information */
1114   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120   #endif
1121 
1122   /* perform the solve analysis */
1123   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125 
1126   PetscCallCUDA(WaitForCUDA());
1127   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128 
1129   /* assign the pointer */
1130   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131 
1132   /*********************************************/
1133   /* Now the Transpose of the Upper Tri Factor */
1134   /*********************************************/
1135 
1136   /* allocate space for the transpose of the upper triangular factor */
1137   PetscCall(PetscNew(&upTriFactorT));
1138   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139 
1140   /* set the matrix descriptors of the upper triangular factor */
1141   matrixType = cusparseGetMatType(upTriFactor->descr);
1142   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145 
1146   /* Create the matrix description */
1147   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152 
1153   /* set the operation */
1154   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155 
1156   /* allocate GPU space for the CSC of the upper triangular factor*/
1157   upTriFactorT->csrMat                 = new CsrMatrix;
1158   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164 
1165   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171   #endif
1172 
1173   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   {
1175     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180   #else
1181                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182   #endif
1183     PetscCallCUSPARSE(stat);
1184   }
1185 
1186   PetscCallCUDA(WaitForCUDA());
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188 
1189   /* Create the solve analysis information */
1190   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196   #endif
1197 
1198   /* perform the solve analysis */
1199   /* christ, would it have killed you to put this stuff in a function????????? */
1200   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202 
1203   PetscCallCUDA(WaitForCUDA());
1204   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205 
1206   /* assign the pointer */
1207   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211 
/*
  Unary functor that converts a PetscScalar to a PetscInt by casting its real part
  (obtained with PetscRealPart()). Callable on both host and device.

  In this file it is handed to thrust::transform() to turn scalar-encoded entry
  positions back into integer indices.
*/
struct PetscScalarToPetscInt {
  /* const-qualified: the functor holds no state, so it must be usable through const objects/references */
  __host__ __device__ PetscInt operator()(PetscScalar s) const { return (PetscInt)PetscRealPart(s); }
};
1215 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Builds, or refreshes the values of, the explicit
  transpose of A kept in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; it is first brought up to date on the GPU with MatSeqAIJCUSPARSECopyToGPU()

  Notes:
  Returns immediately (after the sanity checks) when A->transupdated is already set.

  For a non-CSR storage format the existing transpose is invalidated and rebuilt from scratch.

  For MAT_CUSPARSE_CSR the sparsity pattern of the transpose is computed only once: csr2csc is run
  on a value array holding 0,1,...,nnz-1 so that the transposed "values" are the source positions
  of each entry. These are cached as integers in cusparsestruct->csr2csc_i, and every call
  (including later refreshes) just gathers matrix->values through that permutation.

  On success A->transupdated is set to PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* only the CSR path below can update an existing transpose in place; other formats start over */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose has A->cmap->n rows and A->rmap->n columns */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the (uncompressed) row offsets of A; they are the csr2csc input further below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      /* account for the host-to-device copy, as done for the same upload later in this function */
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns. csr2csc writes temp->num_cols + 1
         column pointers into tempT->row_offsets, so that array must be sized by the number of columns
         of A, not the number of rows, or it overflows for non-square matrices */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(tempT->num_rows + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transpose */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* build the CSR -> CSC permutation once: transpose the sequence 0,1,...,nnz-1 used as "values" */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* matrixT->values now holds the source positions as scalars; cache them as integers */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the current values of A into the transpose through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408 
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - Solves with the LU factors held in A->spptr using two cusparseSpSV_solve() calls.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The sequence is: (optionally) gather b through fs->rpermIndices into fs->X, solve L Y = X into fs->Y,
  solve U X = Y, and (optionally) gather the result through fs->cpermIndices into x.
  When no column permutation is present the U solve writes straight into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     noTrans = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;
  void                         *rhs, *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));

  /* Pick the right-hand side of the L solve: the row-permuted copy of b in fs->X, or b itself */
  if (fs->rpermIndices) {
    thrust::device_ptr<const PetscScalar> bsrc = thrust::device_pointer_cast(bdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bsrc, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bsrc, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    rhs = (void *)fs->X;
  } else {
    rhs = (void *)bdev;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, rhs));

  /* L Y = X. cusparseSpSV_solve() implicitly reuses the external buffer handed to cusparseSpSV_analysis() */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, noTrans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_L));

  /* U X = Y. The result goes to the scratch vector fs->X when it still has to be permuted, else directly to x */
  sol = fs->cpermIndices ? (void *)fs->X : (void *)xdev;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, noTrans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_U));

  /* Apply the column permutation while moving the result from fs->X into x */
  if (fs->cpermIndices) {
    thrust::device_ptr<PetscScalar> xdst = thrust::device_pointer_cast(xdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), xdst));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Transpose solve with the LU factors held in A->spptr, done by
  running cusparseSpSV_solve() with CUSPARSE_OPERATION_TRANSPOSE on the stored L and U.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The SpSV descriptors and buffers for the transposed solves are created on the first call
  (fs->createdTransposeSpSVDescr), and the analysis is redone whenever
  fs->updatedTransposeSpSVAnalysis is not set.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     transOp = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;
  void                         *rhs, *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  /* One-time setup on the first MatSolveTranspose(): descriptors, buffer sizes and buffers */
  if (!fs->createdTransposeSpSVDescr) {
    /* The matrices are still L and U; only the operation passed to SpSV is the transpose */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the transposed-solve analysis when it is flagged as out of date */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));

  /* Pick the right-hand side of the first solve: the row-permuted copy of b in fs->X, or b itself */
  if (fs->rpermIndices) {
    thrust::device_ptr<const PetscScalar> bsrc = thrust::device_pointer_cast(bdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bsrc, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bsrc, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    rhs = (void *)fs->X;
  } else {
    rhs = (void *)bdev;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, rhs));

  /* Ut Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut));

  /* Lt X = Y. When a permutation is still to be applied the result goes to the intermediate buffer fs->X, else to x */
  sol = fs->cpermIndices ? (void *)fs->X : (void *)xdev;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, transOp, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt));

  /* Apply the column permutation while moving the result from fs->X into x */
  if (fs->cpermIndices) {
    thrust::device_ptr<PetscScalar> xdst = thrust::device_pointer_cast(xdev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), xdst));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1532 #else
1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve using explicitly transposed triangular factors and
  cusparseXcsrsv_solve(), with the row and column permutations stored in the tri-factor struct.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are generated with MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() on first use.
  cusparseTriFactors->workVector serves as the intermediate vector between the two triangular solves
  and as the staging area for the final column permutation.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation.
     Thrust calls are wrapped in PetscCallThrust() so that a thrust exception becomes a PETSc error
     instead of propagating out of this error-code based routine */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U: reads xarray, writes the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: reads the work vector, writes xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve with the transposed triangular
  factors kept in A->spptr; no row or column permutation is applied.

  Input Parameters:
+ A  - the factored matrix, whose spptr is a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called first.
  The transposed upper factor is applied first (bb -> work vector), then the transposed lower factor
  (work vector -> xx).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Neither transposed factor is available yet: create and analyze them on the fly */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the solution (write-only) and of the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  {
    CsrMatrix   *Ut  = upperT->csrMat;
    CsrMatrix   *Lt  = lowerT->csrMat;
    PetscScalar *tmp = work->data().get();

    /* Step 1: upper factor, rhs -> tmp */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, Ut->num_rows, Ut->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr, Ut->values->data().get(), Ut->row_offsets->data().get(), Ut->column_indices->data().get(), upperT->solveInfo, rhs, tmp,
                                           upperT->solvePolicy, upperT->solveBuffer));

    /* Step 2: lower factor, tmp -> sol */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, Lt->num_rows, Lt->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr, Lt->values->data().get(), Lt->row_offsets->data().get(), Lt->column_indices->data().get(), lowerT->solveInfo, tmp, sol,
                                           lowerT->solvePolicy, lowerT->solveBuffer));
  }

  /* Hand the arrays back, then stop the GPU timer and log the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1623 
/*
  MatSolve_SeqAIJCUSPARSE - Triangular solve with the lower/upper factors kept in A->spptr,
  including the row and column permutations stored alongside them.

  Input Parameters:
+ A  - the factored matrix, whose spptr is a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  Data flow: bb --(rpermIndices)--> work --(L solve)--> xx --(U solve)--> work --(cpermIndices)--> xx.
  The thrust copies are issued on PetscDefaultCudaStream.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Device arrays of the solution (write-only) and of the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: gather the right-hand side through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->begin()), thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->end()), work->begin());

  {
    CsrMatrix   *Lcsr = lower->csrMat;
    CsrMatrix   *Ucsr = upper->csrMat;
    PetscScalar *tmp  = work->data().get();

    /* Step 2: lower factor, tmp -> sol */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, Lcsr->num_rows, Lcsr->num_entries, &PETSC_CUSPARSE_ONE, lower->descr, Lcsr->values->data().get(), Lcsr->row_offsets->data().get(), Lcsr->column_indices->data().get(), lower->solveInfo, tmp, sol,
                                           lower->solvePolicy, lower->solveBuffer));

    /* Step 3: upper factor, sol -> tmp */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, Ucsr->num_rows, Ucsr->num_entries, &PETSC_CUSPARSE_ONE, upper->descr, Ucsr->values->data().get(), Ucsr->row_offsets->data().get(), Ucsr->column_indices->data().get(), upper->solveInfo, sol, tmp,
                                           upper->solvePolicy, upper->solveBuffer));
  }

  /* Step 4: gather the work vector through the column permutation into the solution */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()), solGPU);

  /* Hand the arrays back, then stop the GPU timer and log the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1663 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Triangular solve with the lower/upper factors kept in
  A->spptr; no row or column permutation is applied.

  Input Parameters:
+ A  - the factored matrix, whose spptr is a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The lower factor is applied first (bb -> work vector), then the upper factor (work vector -> xx).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Device arrays of the solution (write-only) and of the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  {
    CsrMatrix   *Lcsr = lower->csrMat;
    CsrMatrix   *Ucsr = upper->csrMat;
    PetscScalar *tmp  = work->data().get();

    /* Step 1: lower factor, rhs -> tmp */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, Lcsr->num_rows, Lcsr->num_entries, &PETSC_CUSPARSE_ONE, lower->descr, Lcsr->values->data().get(), Lcsr->row_offsets->data().get(), Lcsr->column_indices->data().get(), lower->solveInfo, rhs, tmp,
                                           lower->solvePolicy, lower->solveBuffer));

    /* Step 2: upper factor, tmp -> sol */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, Ucsr->num_rows, Ucsr->num_entries, &PETSC_CUSPARSE_ONE, upper->descr, Ucsr->values->data().get(), Ucsr->row_offsets->data().get(), Ucsr->column_indices->data().get(), upper->solveInfo, tmp, sol,
                                           upper->solvePolicy, upper->solveBuffer));
  }

  /* Hand the arrays back, then stop the GPU timer and log the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1693 #endif
1694 
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric phase of the ILU(0) factorization on the device.

  Input Parameters:
+ fact - the factor matrix; its spptr (Mat_SeqAIJCUSPARSETriFactors) holds the device CSR arrays,
         descriptors and buffers used here
. A    - the matrix to factor; checked to be MATSEQAIJCUSPARSE in debug builds
- info - unused

  Notes:
  A's device values are copied into fact's value array, which is then factored in place with
  cusparseXcsrilu02() (skipped when there are no rows). Afterwards the SpSV analysis for L and U is
  run, the transpose-analysis flag is reset, and the solve callbacks of fact are installed.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factHost = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Agpu     = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  const PetscInt nrows = fact->rmap->n;
  const PetscInt nnz   = factHost->nz;

  /* Bring A up to date on the device, then copy its values into the factor's value array */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  CsrMatrix *Adev = (CsrMatrix *)Agpu->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(factors->csrVal, Adev->values->data().get(), sizeof(PetscScalar) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* In-place factorization; cusparseXcsrilu02 errors out with empty matrices (m=0), hence the guard */
  if (nrows) {
    PetscCallCUSPARSE(cusparseXcsrilu02(factors->handle, nrows, nnz, factors->matDescr_M, factors->csrVal, factors->csrRowPtr32, factors->csrColIdx32, factors->ilu0Info_M, factors->policy_M, factors->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t pivotStatus = cusparseXcsrilu02_zeroPivot(factors->handle, factors->ilu0Info_M, &numerical_zero);

    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != pivotStatus, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it needs valid matrix values, so it has to come after cusparseXcsrilu02().
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L, factors->spsvBuffer_L));
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U, factors->spsvBuffer_U));

  /* The values of L and U are new, so a transpose solve must redo its own cusparseSpSV_analysis() */
  factors->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  /* spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. */
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(factors->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1747 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - Symbolic phase of the ILU(0) factorization on the device.

  Input Parameters:
+ fact - the factor matrix to set up; its spptr is a Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix to factor; in debug builds it is checked to be a square MATSEQAIJCUSPARSE matrix
         with no missing diagonal entry
. (IS) - two unnamed index-set arguments, unused
- info - factorization options; only info->fill is read (recorded as fill_ratio_given)

  Notes:
  The routine resets any previous factor data, duplicates A's meta data into fact, copies A's 32-bit
  row pointers and column indices to freshly allocated device arrays, creates the cuSPARSE descriptors
  for M, L and U (L and U share the same index/value arrays), queries and allocates the work buffers,
  runs the csrilu02 analysis, estimates the FLOPs of the numeric phase, and finally installs
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 as the numeric factorization routine of fact.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        /* Count the entries strictly left of the diagonal from the diagonal position itself. The row
           pattern need not be symmetric, so this must not be approximated by (nzRow - 1) / 2.
        */
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1904 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - Solve with the ICC(0) factor using two cusparseSpSV_solve() calls
  on the same lower-triangular descriptor.

  Input Parameters:
+ fact - the factored matrix, whose spptr is a Mat_SeqAIJCUSPARSETriFactors
- b    - the right-hand side vector

  Output Parameter:
. x - the solution vector

  Notes:
  The first solve uses CUSPARSE_OPERATION_NON_TRANSPOSE (b -> internal buffer Y), the second uses
  CUSPARSE_OPERATION_TRANSPOSE (Y -> x). The dense-vector descriptors are re-pointed at the
  vectors' device arrays instead of copying data.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factHost = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward step, L Y = X: X aliases the right-hand side, Y is the internal buffer */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Backward step, Lt X = Y: X now aliases the solution array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factHost->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1935 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric phase of the ICC(0) factorization on the device.

  Input Parameters:
+ fact - the factor matrix; its spptr (Mat_SeqAIJCUSPARSETriFactors) holds the device CSR arrays,
         descriptors and buffers used here
. A    - the matrix to factor; checked to be MATSEQAIJCUSPARSE in debug builds
- info - unused

  Notes:
  A's device values are copied into fact's value array, which is then factored in place with
  cusparseXcsric02() (skipped when there are no rows). Afterwards the SpSV analysis is run for the
  non-transposed and the transposed use of L, and the solve callbacks of fact are installed.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factHost = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Agpu     = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  const PetscInt nrows = fact->rmap->n;
  const PetscInt nnz   = factHost->nz;

  /* Bring A up to date on the device, then copy its values into the factor's value array */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  CsrMatrix *Adev = (CsrMatrix *)Agpu->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(factors->csrVal, Adev->values->data().get(), sizeof(PetscScalar) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* In-place factorization.
     https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (nrows) {
    PetscCallCUSPARSE(cusparseXcsric02(factors->handle, nrows, nnz, factors->matDescr_M, factors->csrVal, factors->csrRowPtr32, factors->csrColIdx32, factors->ic0Info_M, factors->policy_M, factors->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t pivotStatus = cusparseXcsric02_zeroPivot(factors->handle, factors->ic0Info_M, &numerical_zero);

    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != pivotStatus, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analysis for the non-transposed use of L */
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L, factors->spsvBuffer_L));

  /* Analysis for the transposed use of L.
     Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, factors->spsvBuffer_Lt));

  /* The same routine is installed for both the solve and the transpose solve */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(factors->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1988 
1989 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1990 {
1991   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1992   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1993   PetscInt                      m, nz;
1994 
1995   PetscFunctionBegin;
1996   if (PetscDefined(USE_DEBUG)) {
1997     PetscInt  i;
1998     PetscBool flg, missing;
1999 
2000     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2001     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2002     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2003     PetscCall(MatMissingDiagonal(A, &missing, &i));
2004     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2005   }
2006 
2007   /* Free the old stale stuff */
2008   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2009 
2010   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2011      but they will not be used. Allocate them just for easy debugging.
2012    */
2013   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2014 
2015   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2016   fact->factortype             = MAT_FACTOR_ICC;
2017   fact->info.factor_mallocs    = 0;
2018   fact->info.fill_ratio_given  = info->fill;
2019   fact->info.fill_ratio_needed = 1.0;
2020 
2021   aij->row = NULL;
2022   aij->col = NULL;
2023 
2024   /* ====================================================================== */
2025   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2026   /* We'll do in-place factorization on fact                                */
2027   /* ====================================================================== */
2028   const int *Ai, *Aj;
2029 
2030   m  = fact->rmap->n;
2031   nz = aij->nz;
2032 
2033   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
2034   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
2035   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2036   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2037   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2038   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2039 
2040   /* ====================================================================== */
2041   /* Create mat descriptors for M, L                                        */
2042   /* ====================================================================== */
2043   cusparseFillMode_t fillMode;
2044   cusparseDiagType_t diagType;
2045 
2046   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2047   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2048   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2049 
2050   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2051     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2052     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2053     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2054     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2055   */
2056   fillMode = CUSPARSE_FILL_MODE_LOWER;
2057   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2058   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2059   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2060   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2061 
2062   /* ========================================================================= */
2063   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2064   /* ========================================================================= */
2065   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2066   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2067 
2068   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2069   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2070 
2071   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2072   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2073 
2074   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2075   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2079 
2080   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2081      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2082    */
2083   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2084     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2085     fs->spsvBuffer_L = fs->factBuffer_M;
2086     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2087   } else {
2088     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2089     fs->spsvBuffer_Lt = fs->factBuffer_M;
2090     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2091   }
2092 
2093   /* ========================================================================== */
2094   /* Perform analysis of ic0 on M                                               */
2095   /* The lower triangular part of M has the same sparsity pattern as L          */
2096   /* ========================================================================== */
2097   int              structural_zero;
2098   cusparseStatus_t status;
2099 
2100   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2101   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2102   if (PetscDefined(USE_DEBUG)) {
2103     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2104     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2105     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2106   }
2107 
2108   /* Estimate FLOPs of the numeric factorization */
2109   {
2110     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2111     PetscInt      *Ai, nzRow, nzLeft;
2112     PetscLogDouble flops = 0.0;
2113 
2114     Ai = Aseq->i;
2115     for (PetscInt i = 0; i < m; i++) {
2116       nzRow = Ai[i + 1] - Ai[i];
2117       if (nzRow > 1) {
2118         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2119           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2120         */
2121         nzLeft = (nzRow - 1) / 2;
2122         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2123       }
2124     }
2125     fs->numericFactFlops = flops;
2126   }
2127   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 #endif
2131 
2132 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2133 {
2134   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2135   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2136 
2137   PetscFunctionBegin;
2138   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2139   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2140   B->offloadmask = PETSC_OFFLOAD_CPU;
2141 
2142   if (!cusparsestruct->use_cpu_solve) {
2143 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2144     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2145     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2146 #else
2147     /* determine which version of MatSolve needs to be used. */
2148     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2149     IS          isrow = b->row, iscol = b->col;
2150     PetscBool   row_identity, col_identity;
2151 
2152     PetscCall(ISIdentity(isrow, &row_identity));
2153     PetscCall(ISIdentity(iscol, &col_identity));
2154     if (row_identity && col_identity) {
2155       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2156       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2157     } else {
2158       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2159       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2160     }
2161 #endif
2162   }
2163   B->ops->matsolve          = NULL;
2164   B->ops->matsolvetranspose = NULL;
2165 
2166   /* get the triangular factors */
2167   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2168   PetscFunctionReturn(PETSC_SUCCESS);
2169 }
2170 
2171 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2172 {
2173   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2174 
2175   PetscFunctionBegin;
2176   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2177   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2178   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2179   PetscFunctionReturn(PETSC_SUCCESS);
2180 }
2181 
2182 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2183 {
2184   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2185 
2186   PetscFunctionBegin;
2187 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2188   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2189   if (cusparseTriFactors->factorizeOnDevice) {
2190     PetscCall(ISIdentity(isrow, &row_identity));
2191     PetscCall(ISIdentity(iscol, &col_identity));
2192   }
2193   if (!info->levels && row_identity && col_identity) {
2194     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2195   } else
2196 #endif
2197   {
2198     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2199     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2200     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2201   }
2202   PetscFunctionReturn(PETSC_SUCCESS);
2203 }
2204 
2205 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2206 {
2207   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2208 
2209   PetscFunctionBegin;
2210 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2211   PetscBool perm_identity = PETSC_FALSE;
2212   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
2213   if (!info->levels && perm_identity) {
2214     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2215   } else
2216 #endif
2217   {
2218     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2219     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2220     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2221   }
2222   PetscFunctionReturn(PETSC_SUCCESS);
2223 }
2224 
2225 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2226 {
2227   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2228 
2229   PetscFunctionBegin;
2230   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2231   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2232   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2233   PetscFunctionReturn(PETSC_SUCCESS);
2234 }
2235 
2236 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2237 {
2238   PetscFunctionBegin;
2239   *type = MATSOLVERCUSPARSE;
2240   PetscFunctionReturn(PETSC_SUCCESS);
2241 }
2242 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
2256 
2257 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2258 {
2259   PetscInt  n = A->rmap->n;
2260   PetscBool factOnDevice, factOnHost;
2261   char     *prefix;
2262   char      factPlace[32] = "device"; /* the default */
2263 
2264   PetscFunctionBegin;
2265   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2266   PetscCall(MatSetSizes(*B, n, n, n, n));
2267   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2268   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2269 
2270   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
2271   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
2272   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
2273   PetscOptionsEnd();
2274   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
2275   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
2276   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
2277   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
2278 
2279   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2280   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2281     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2282     if (!A->boundtocpu) {
2283       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2284       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2285     } else {
2286       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2287       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2288     }
2289     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2290     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2291     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2292   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2293     if (!A->boundtocpu) {
2294       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2295       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2296     } else {
2297       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2298       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2299     }
2300     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2301     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2302   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2303 
2304   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2305   (*B)->canuseordering = PETSC_TRUE;
2306   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2307   PetscFunctionReturn(PETSC_SUCCESS);
2308 }
2309 
2310 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2311 {
2312   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2313   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2314 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2315   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2316 #endif
2317 
2318   PetscFunctionBegin;
2319   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2320     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2321     if (A->factortype == MAT_FACTOR_NONE) {
2322       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2323       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2324     }
2325 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2326     else if (fs->csrVal) {
2327       /* We have a factorized matrix on device and are able to copy it to host */
2328       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2329     }
2330 #endif
2331     else
2332       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2333     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2334     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2335     A->offloadmask = PETSC_OFFLOAD_BOTH;
2336   }
2337   PetscFunctionReturn(PETSC_SUCCESS);
2338 }
2339 
2340 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2341 {
2342   PetscFunctionBegin;
2343   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2344   *array = ((Mat_SeqAIJ *)A->data)->a;
2345   PetscFunctionReturn(PETSC_SUCCESS);
2346 }
2347 
2348 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2349 {
2350   PetscFunctionBegin;
2351   A->offloadmask = PETSC_OFFLOAD_CPU;
2352   *array         = NULL;
2353   PetscFunctionReturn(PETSC_SUCCESS);
2354 }
2355 
2356 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2357 {
2358   PetscFunctionBegin;
2359   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2360   *array = ((Mat_SeqAIJ *)A->data)->a;
2361   PetscFunctionReturn(PETSC_SUCCESS);
2362 }
2363 
2364 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2365 {
2366   PetscFunctionBegin;
2367   *array = NULL;
2368   PetscFunctionReturn(PETSC_SUCCESS);
2369 }
2370 
2371 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2372 {
2373   PetscFunctionBegin;
2374   *array = ((Mat_SeqAIJ *)A->data)->a;
2375   PetscFunctionReturn(PETSC_SUCCESS);
2376 }
2377 
2378 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2379 {
2380   PetscFunctionBegin;
2381   A->offloadmask = PETSC_OFFLOAD_CPU;
2382   *array         = NULL;
2383   PetscFunctionReturn(PETSC_SUCCESS);
2384 }
2385 
2386 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2387 {
2388   Mat_SeqAIJCUSPARSE *cusp;
2389   CsrMatrix          *matrix;
2390 
2391   PetscFunctionBegin;
2392   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2393   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2394   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2395   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2396   matrix = (CsrMatrix *)cusp->mat->mat;
2397 
2398   if (i) {
2399 #if !defined(PETSC_USE_64BIT_INDICES)
2400     *i = matrix->row_offsets->data().get();
2401 #else
2402     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2403 #endif
2404   }
2405   if (j) {
2406 #if !defined(PETSC_USE_64BIT_INDICES)
2407     *j = matrix->column_indices->data().get();
2408 #else
2409     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2410 #endif
2411   }
2412   if (a) *a = matrix->values->data().get();
2413   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2414   PetscFunctionReturn(PETSC_SUCCESS);
2415 }
2416 
2417 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2418 {
2419   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2420   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2421   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2422   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2423   cusparseStatus_t              stat;
2424   PetscBool                     both = PETSC_TRUE;
2425 
2426   PetscFunctionBegin;
2427   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2428   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2429     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2430       CsrMatrix *matrix;
2431       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2432 
2433       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2434       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2435       matrix->values->assign(a->a, a->a + a->nz);
2436       PetscCallCUDA(WaitForCUDA());
2437       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2438       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2439       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2440     } else {
2441       PetscInt nnz;
2442       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2443       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2444       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2445       delete cusparsestruct->workVector;
2446       delete cusparsestruct->rowoffsets_gpu;
2447       cusparsestruct->workVector     = NULL;
2448       cusparsestruct->rowoffsets_gpu = NULL;
2449       try {
2450         if (a->compressedrow.use) {
2451           m    = a->compressedrow.nrows;
2452           ii   = a->compressedrow.i;
2453           ridx = a->compressedrow.rindex;
2454         } else {
2455           m    = A->rmap->n;
2456           ii   = a->i;
2457           ridx = NULL;
2458         }
2459         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2460         if (!a->a) {
2461           nnz  = ii[m];
2462           both = PETSC_FALSE;
2463         } else nnz = a->nz;
2464         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2465 
2466         /* create cusparse matrix */
2467         cusparsestruct->nrows = m;
2468         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2469         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2470         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2471         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2472 
2473         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2474         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2475         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2476         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2477         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2478         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2480 
2481         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2482         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2483           /* set the matrix */
2484           CsrMatrix *mat   = new CsrMatrix;
2485           mat->num_rows    = m;
2486           mat->num_cols    = A->cmap->n;
2487           mat->num_entries = nnz;
2488           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2489           mat->row_offsets->assign(ii, ii + m + 1);
2490 
2491           mat->column_indices = new THRUSTINTARRAY32(nnz);
2492           mat->column_indices->assign(a->j, a->j + nnz);
2493 
2494           mat->values = new THRUSTARRAY(nnz);
2495           if (a->a) mat->values->assign(a->a, a->a + nnz);
2496 
2497           /* assign the pointer */
2498           matstruct->mat = mat;
2499 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2500           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2501             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2502                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2503             PetscCallCUSPARSE(stat);
2504           }
2505 #endif
2506         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2507 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2508           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2509 #else
2510           CsrMatrix *mat   = new CsrMatrix;
2511           mat->num_rows    = m;
2512           mat->num_cols    = A->cmap->n;
2513           mat->num_entries = nnz;
2514           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2515           mat->row_offsets->assign(ii, ii + m + 1);
2516 
2517           mat->column_indices = new THRUSTINTARRAY32(nnz);
2518           mat->column_indices->assign(a->j, a->j + nnz);
2519 
2520           mat->values = new THRUSTARRAY(nnz);
2521           if (a->a) mat->values->assign(a->a, a->a + nnz);
2522 
2523           cusparseHybMat_t hybMat;
2524           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2525           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2526           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2527           PetscCallCUSPARSE(stat);
2528           /* assign the pointer */
2529           matstruct->mat = hybMat;
2530 
2531           if (mat) {
2532             if (mat->values) delete (THRUSTARRAY *)mat->values;
2533             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2534             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2535             delete (CsrMatrix *)mat;
2536           }
2537 #endif
2538         }
2539 
2540         /* assign the compressed row indices */
2541         if (a->compressedrow.use) {
2542           cusparsestruct->workVector = new THRUSTARRAY(m);
2543           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2544           matstruct->cprowIndices->assign(ridx, ridx + m);
2545           tmp = m;
2546         } else {
2547           cusparsestruct->workVector = NULL;
2548           matstruct->cprowIndices    = NULL;
2549           tmp                        = 0;
2550         }
2551         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2552 
2553         /* assign the pointer */
2554         cusparsestruct->mat = matstruct;
2555       } catch (char *ex) {
2556         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2557       }
2558       PetscCallCUDA(WaitForCUDA());
2559       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2560       cusparsestruct->nonzerostate = A->nonzerostate;
2561     }
2562     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2563   }
2564   PetscFunctionReturn(PETSC_SUCCESS);
2565 }
2566 
2567 struct VecCUDAPlusEquals {
2568   template <typename Tuple>
2569   __host__ __device__ void operator()(Tuple t)
2570   {
2571     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2572   }
2573 };
2574 
2575 struct VecCUDAEquals {
2576   template <typename Tuple>
2577   __host__ __device__ void operator()(Tuple t)
2578   {
2579     thrust::get<1>(t) = thrust::get<0>(t);
2580   }
2581 };
2582 
2583 struct VecCUDAEqualsReverse {
2584   template <typename Tuple>
2585   __host__ __device__ void operator()(Tuple t)
2586   {
2587     thrust::get<0>(t) = thrust::get<1>(t);
2588   }
2589 };
2590 
2591 struct MatMatCusparse {
2592   PetscBool      cisdense;
2593   PetscScalar   *Bt;
2594   Mat            X;
2595   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2596   PetscLogDouble flops;
2597   CsrMatrix     *Bcsr;
2598 
2599 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2600   cusparseSpMatDescr_t matSpBDescr;
2601   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2602   cusparseDnMatDescr_t matBDescr;
2603   cusparseDnMatDescr_t matCDescr;
2604   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2605   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2606   void *dBuffer4;
2607   void *dBuffer5;
2608   #endif
2609   size_t                mmBufferSize;
2610   void                 *mmBuffer;
2611   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2612   cusparseSpGEMMDescr_t spgemmDesc;
2613 #endif
2614 };
2615 
2616 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2617 {
2618   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2619 
2620   PetscFunctionBegin;
2621   PetscCallCUDA(cudaFree(mmdata->Bt));
2622   delete mmdata->Bcsr;
2623 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2624   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2625   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2626   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2627   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2628   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2629   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2630   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2631   #endif
2632   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2633   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2634 #endif
2635   PetscCall(MatDestroy(&mmdata->X));
2636   PetscCall(PetscFree(data));
2637   PetscFunctionReturn(PETSC_SUCCESS);
2638 }
2639 
2640 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2641 
2642 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643 {
2644   Mat_Product                  *product = C->product;
2645   Mat                           A, B;
2646   PetscInt                      m, n, blda, clda;
2647   PetscBool                     flg, biscuda;
2648   Mat_SeqAIJCUSPARSE           *cusp;
2649   cusparseStatus_t              stat;
2650   cusparseOperation_t           opA;
2651   const PetscScalar            *barray;
2652   PetscScalar                  *carray;
2653   MatMatCusparse               *mmdata;
2654   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655   CsrMatrix                    *csrmat;
2656 
2657   PetscFunctionBegin;
2658   MatCheckProduct(C, 1);
2659   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660   mmdata = (MatMatCusparse *)product->data;
2661   A      = product->A;
2662   B      = product->B;
2663   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666      Instead of silently accepting the wrong answer, I prefer to raise the error */
2667   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670   switch (product->type) {
2671   case MATPRODUCT_AB:
2672   case MATPRODUCT_PtAP:
2673     mat = cusp->mat;
2674     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675     m   = A->rmap->n;
2676     n   = B->cmap->n;
2677     break;
2678   case MATPRODUCT_AtB:
2679     if (!A->form_explicit_transpose) {
2680       mat = cusp->mat;
2681       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682     } else {
2683       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684       mat = cusp->matTranspose;
2685       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686     }
2687     m = A->cmap->n;
2688     n = B->cmap->n;
2689     break;
2690   case MATPRODUCT_ABt:
2691   case MATPRODUCT_RARt:
2692     mat = cusp->mat;
2693     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694     m   = A->rmap->n;
2695     n   = B->rmap->n;
2696     break;
2697   default:
2698     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699   }
2700   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701   csrmat = (CsrMatrix *)mat->mat;
2702   /* if the user passed a CPU matrix, copy the data to the GPU */
2703   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2706 
2707   PetscCall(MatDenseGetLDA(B, &blda));
2708   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711   } else {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(C, &clda));
2714   }
2715 
2716   PetscCall(PetscLogGpuTimeBegin());
2717 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2720   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2721     size_t mmBufferSize;
2722     if (mmdata->initialized && mmdata->Blda != blda) {
2723       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2724       mmdata->matBDescr = NULL;
2725     }
2726     if (!mmdata->matBDescr) {
2727       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2728       mmdata->Blda = blda;
2729     }
2730 
2731     if (mmdata->initialized && mmdata->Clda != clda) {
2732       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2733       mmdata->matCDescr = NULL;
2734     }
2735     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2736       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737       mmdata->Clda = clda;
2738     }
2739 
2740     if (!mat->matDescr) {
2741       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2742                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2743       PetscCallCUSPARSE(stat);
2744     }
2745     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2746     PetscCallCUSPARSE(stat);
2747     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2748       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2749       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2750       mmdata->mmBufferSize = mmBufferSize;
2751     }
2752     mmdata->initialized = PETSC_TRUE;
2753   } else {
2754     /* to be safe, always update pointers of the mats */
2755     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2756     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2757     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2758   }
2759 
2760   /* do cusparseSpMM, which supports transpose on B */
2761   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2762   PetscCallCUSPARSE(stat);
2763 #else
2764   PetscInt k;
2765   /* cusparseXcsrmm does not support transpose on B */
2766   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2767     cublasHandle_t cublasv2handle;
2768     cublasStatus_t cerr;
2769 
2770     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2771     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2772     PetscCallCUBLAS(cerr);
2773     blda = B->cmap->n;
2774     k    = B->cmap->n;
2775   } else {
2776     k = B->rmap->n;
2777   }
2778 
2779   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2780   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2781   PetscCallCUSPARSE(stat);
2782 #endif
2783   PetscCall(PetscLogGpuTimeEnd());
2784   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2785   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2786   if (product->type == MATPRODUCT_RARt) {
2787     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2788     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2789   } else if (product->type == MATPRODUCT_PtAP) {
2790     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2791     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2792   } else {
2793     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2794   }
2795   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2796   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2797   PetscFunctionReturn(PETSC_SUCCESS);
2798 }
2799 
2800 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2801 {
2802   Mat_Product        *product = C->product;
2803   Mat                 A, B;
2804   PetscInt            m, n;
2805   PetscBool           cisdense, flg;
2806   MatMatCusparse     *mmdata;
2807   Mat_SeqAIJCUSPARSE *cusp;
2808 
2809   PetscFunctionBegin;
2810   MatCheckProduct(C, 1);
2811   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2812   A = product->A;
2813   B = product->B;
2814   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2815   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2816   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2817   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2818   switch (product->type) {
2819   case MATPRODUCT_AB:
2820     m = A->rmap->n;
2821     n = B->cmap->n;
2822     break;
2823   case MATPRODUCT_AtB:
2824     m = A->cmap->n;
2825     n = B->cmap->n;
2826     break;
2827   case MATPRODUCT_ABt:
2828     m = A->rmap->n;
2829     n = B->rmap->n;
2830     break;
2831   case MATPRODUCT_PtAP:
2832     m = B->cmap->n;
2833     n = B->cmap->n;
2834     break;
2835   case MATPRODUCT_RARt:
2836     m = B->rmap->n;
2837     n = B->rmap->n;
2838     break;
2839   default:
2840     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2841   }
2842   PetscCall(MatSetSizes(C, m, n, m, n));
2843   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2844   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2845   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2846 
2847   /* product data */
2848   PetscCall(PetscNew(&mmdata));
2849   mmdata->cisdense = cisdense;
2850 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2851   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2852   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2853 #endif
2854   /* for these products we need intermediate storage */
2855   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2856     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2857     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2858     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2859       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2860     } else {
2861       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2862     }
2863   }
2864   C->product->data    = mmdata;
2865   C->product->destroy = MatDestroy_MatMatCusparse;
2866 
2867   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2868   PetscFunctionReturn(PETSC_SUCCESS);
2869 }
2870 
2871 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2872 {
2873   Mat_Product                  *product = C->product;
2874   Mat                           A, B;
2875   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2876   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2877   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2878   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2879   PetscBool                     flg;
2880   cusparseStatus_t              stat;
2881   MatProductType                ptype;
2882   MatMatCusparse               *mmdata;
2883 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2884   cusparseSpMatDescr_t BmatSpDescr;
2885 #endif
2886   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2887 
2888   PetscFunctionBegin;
2889   MatCheckProduct(C, 1);
2890   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2891   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2892   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2893   mmdata = (MatMatCusparse *)C->product->data;
2894   A      = product->A;
2895   B      = product->B;
2896   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2897     mmdata->reusesym = PETSC_FALSE;
2898     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2899     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2900     Cmat = Ccusp->mat;
2901     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2902     Ccsr = (CsrMatrix *)Cmat->mat;
2903     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2904     goto finalize;
2905   }
2906   if (!c->nz) goto finalize;
2907   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2908   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2909   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2910   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2911   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2912   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2913   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2914   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2915   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2916   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2917   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2918   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2919   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2920   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2921 
2922   ptype = product->type;
2923   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2924     ptype = MATPRODUCT_AB;
2925     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2926   }
2927   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2928     ptype = MATPRODUCT_AB;
2929     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2930   }
2931   switch (ptype) {
2932   case MATPRODUCT_AB:
2933     Amat = Acusp->mat;
2934     Bmat = Bcusp->mat;
2935     break;
2936   case MATPRODUCT_AtB:
2937     Amat = Acusp->matTranspose;
2938     Bmat = Bcusp->mat;
2939     break;
2940   case MATPRODUCT_ABt:
2941     Amat = Acusp->mat;
2942     Bmat = Bcusp->matTranspose;
2943     break;
2944   default:
2945     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2946   }
2947   Cmat = Ccusp->mat;
2948   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2949   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2950   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2951   Acsr = (CsrMatrix *)Amat->mat;
2952   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2953   Ccsr = (CsrMatrix *)Cmat->mat;
2954   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2955   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2956   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2957   PetscCall(PetscLogGpuTimeBegin());
2958 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2959   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2960   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2961   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2962   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2963   PetscCallCUSPARSE(stat);
2964   #else
2965   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2966   PetscCallCUSPARSE(stat);
2967   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2968   PetscCallCUSPARSE(stat);
2969   #endif
2970 #else
2971   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2972                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2973   PetscCallCUSPARSE(stat);
2974 #endif
2975   PetscCall(PetscLogGpuFlops(mmdata->flops));
2976   PetscCallCUDA(WaitForCUDA());
2977   PetscCall(PetscLogGpuTimeEnd());
2978   C->offloadmask = PETSC_OFFLOAD_GPU;
2979 finalize:
2980   /* shorter version of MatAssemblyEnd_SeqAIJ */
2981   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2982   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2983   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2984   c->reallocs = 0;
2985   C->info.mallocs += 0;
2986   C->info.nz_unneeded = 0;
2987   C->assembled = C->was_assembled = PETSC_TRUE;
2988   C->num_ass++;
2989   PetscFunctionReturn(PETSC_SUCCESS);
2990 }
2991 
2992 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2993 {
2994   Mat_Product                  *product = C->product;
2995   Mat                           A, B;
2996   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2997   Mat_SeqAIJ                   *a, *b, *c;
2998   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2999   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3000   PetscInt                      i, j, m, n, k;
3001   PetscBool                     flg;
3002   cusparseStatus_t              stat;
3003   MatProductType                ptype;
3004   MatMatCusparse               *mmdata;
3005   PetscLogDouble                flops;
3006   PetscBool                     biscompressed, ciscompressed;
3007 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3008   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3009   cusparseSpMatDescr_t BmatSpDescr;
3010 #else
3011   int cnz;
3012 #endif
3013   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3014 
3015   PetscFunctionBegin;
3016   MatCheckProduct(C, 1);
3017   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3018   A = product->A;
3019   B = product->B;
3020   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3021   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3022   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3023   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3024   a = (Mat_SeqAIJ *)A->data;
3025   b = (Mat_SeqAIJ *)B->data;
3026   /* product data */
3027   PetscCall(PetscNew(&mmdata));
3028   C->product->data    = mmdata;
3029   C->product->destroy = MatDestroy_MatMatCusparse;
3030 
3031   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3032   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3033   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3034   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3035   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3037 
3038   ptype = product->type;
3039   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3040     ptype                                          = MATPRODUCT_AB;
3041     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3042   }
3043   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3044     ptype                                          = MATPRODUCT_AB;
3045     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3046   }
3047   biscompressed = PETSC_FALSE;
3048   ciscompressed = PETSC_FALSE;
3049   switch (ptype) {
3050   case MATPRODUCT_AB:
3051     m    = A->rmap->n;
3052     n    = B->cmap->n;
3053     k    = A->cmap->n;
3054     Amat = Acusp->mat;
3055     Bmat = Bcusp->mat;
3056     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3057     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3058     break;
3059   case MATPRODUCT_AtB:
3060     m = A->cmap->n;
3061     n = B->cmap->n;
3062     k = A->rmap->n;
3063     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3064     Amat = Acusp->matTranspose;
3065     Bmat = Bcusp->mat;
3066     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3067     break;
3068   case MATPRODUCT_ABt:
3069     m = A->rmap->n;
3070     n = B->rmap->n;
3071     k = A->cmap->n;
3072     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3073     Amat = Acusp->mat;
3074     Bmat = Bcusp->matTranspose;
3075     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3076     break;
3077   default:
3078     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3079   }
3080 
3081   /* create cusparse matrix */
3082   PetscCall(MatSetSizes(C, m, n, m, n));
3083   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3084   c     = (Mat_SeqAIJ *)C->data;
3085   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3086   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3087   Ccsr  = new CsrMatrix;
3088 
3089   c->compressedrow.use = ciscompressed;
3090   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3091     c->compressedrow.nrows = a->compressedrow.nrows;
3092     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3093     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3094     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3095     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3096     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3097   } else {
3098     c->compressedrow.nrows  = 0;
3099     c->compressedrow.i      = NULL;
3100     c->compressedrow.rindex = NULL;
3101     Ccusp->workVector       = NULL;
3102     Cmat->cprowIndices      = NULL;
3103   }
3104   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3105   Ccusp->mat        = Cmat;
3106   Ccusp->mat->mat   = Ccsr;
3107   Ccsr->num_rows    = Ccusp->nrows;
3108   Ccsr->num_cols    = n;
3109   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3110   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3111   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3112   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3113   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
3114   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
3115   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3116   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3117   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3119   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3120     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3121     c->nz                = 0;
3122     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3123     Ccsr->values         = new THRUSTARRAY(c->nz);
3124     goto finalizesym;
3125   }
3126 
3127   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3128   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3129   Acsr = (CsrMatrix *)Amat->mat;
3130   if (!biscompressed) {
3131     Bcsr = (CsrMatrix *)Bmat->mat;
3132 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3133     BmatSpDescr = Bmat->matDescr;
3134 #endif
3135   } else { /* we need to use row offsets for the full matrix */
3136     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3137     Bcsr                 = new CsrMatrix;
3138     Bcsr->num_rows       = B->rmap->n;
3139     Bcsr->num_cols       = cBcsr->num_cols;
3140     Bcsr->num_entries    = cBcsr->num_entries;
3141     Bcsr->column_indices = cBcsr->column_indices;
3142     Bcsr->values         = cBcsr->values;
3143     if (!Bcusp->rowoffsets_gpu) {
3144       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3145       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3146       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3147     }
3148     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3149     mmdata->Bcsr      = Bcsr;
3150 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3151     if (Bcsr->num_rows && Bcsr->num_cols) {
3152       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3153       PetscCallCUSPARSE(stat);
3154     }
3155     BmatSpDescr = mmdata->matSpBDescr;
3156 #endif
3157   }
3158   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3159   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3160   /* precompute flops count */
3161   if (ptype == MATPRODUCT_AB) {
3162     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3163       const PetscInt st = a->i[i];
3164       const PetscInt en = a->i[i + 1];
3165       for (j = st; j < en; j++) {
3166         const PetscInt brow = a->j[j];
3167         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3168       }
3169     }
3170   } else if (ptype == MATPRODUCT_AtB) {
3171     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3172       const PetscInt anzi = a->i[i + 1] - a->i[i];
3173       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3174       flops += (2. * anzi) * bnzi;
3175     }
3176   } else { /* TODO */
3177     flops = 0.;
3178   }
3179 
3180   mmdata->flops = flops;
3181   PetscCall(PetscLogGpuTimeBegin());
3182 
3183 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3184   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3185   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3186   PetscCallCUSPARSE(stat);
3187   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3188   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3189   {
3190     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3191      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3192   */
3193     void *dBuffer1 = NULL;
3194     void *dBuffer2 = NULL;
3195     void *dBuffer3 = NULL;
3196     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3197     size_t bufferSize1 = 0;
3198     size_t bufferSize2 = 0;
3199     size_t bufferSize3 = 0;
3200     size_t bufferSize4 = 0;
3201     size_t bufferSize5 = 0;
3202 
3203     /* ask bufferSize1 bytes for external memory */
3204     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3205     PetscCallCUSPARSE(stat);
3206     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3207     /* inspect the matrices A and B to understand the memory requirement for the next step */
3208     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3209     PetscCallCUSPARSE(stat);
3210 
3211     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3212     PetscCallCUSPARSE(stat);
3213     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3214     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3215     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3216     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3217     PetscCallCUSPARSE(stat);
3218     PetscCallCUDA(cudaFree(dBuffer1));
3219     PetscCallCUDA(cudaFree(dBuffer2));
3220 
3221     /* get matrix C non-zero entries C_nnz1 */
3222     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3223     c->nz = (PetscInt)C_nnz1;
3224     /* allocate matrix C */
3225     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3226     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3227     Ccsr->values = new THRUSTARRAY(c->nz);
3228     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3229     /* update matC with the new pointers */
3230     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3231     PetscCallCUSPARSE(stat);
3232 
3233     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3234     PetscCallCUSPARSE(stat);
3235     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3236     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3237     PetscCallCUSPARSE(stat);
3238     PetscCallCUDA(cudaFree(dBuffer3));
3239     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3240     PetscCallCUSPARSE(stat);
3241     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3242   }
3243   #else
3244   size_t bufSize2;
3245   /* ask bufferSize bytes for external memory */
3246   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3247   PetscCallCUSPARSE(stat);
3248   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3249   /* inspect the matrices A and B to understand the memory requirement for the next step */
3250   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3251   PetscCallCUSPARSE(stat);
3252   /* ask bufferSize again bytes for external memory */
3253   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3254   PetscCallCUSPARSE(stat);
3255   /* The CUSPARSE documentation is not clear, nor the API
3256      We need both buffers to perform the operations properly!
3257      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3258      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3259      is stored in the descriptor! What a messy API... */
3260   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3261   /* compute the intermediate product of A * B */
3262   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3263   PetscCallCUSPARSE(stat);
3264   /* get matrix C non-zero entries C_nnz1 */
3265   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3266   c->nz = (PetscInt)C_nnz1;
3267   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3268                       mmdata->mmBufferSize / 1024));
3269   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3270   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3271   Ccsr->values = new THRUSTARRAY(c->nz);
3272   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3273   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3274   PetscCallCUSPARSE(stat);
3275   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3276   PetscCallCUSPARSE(stat);
3277   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3278 #else
3279   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3280   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3281                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3282   PetscCallCUSPARSE(stat);
3283   c->nz                = cnz;
3284   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3285   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3286   Ccsr->values = new THRUSTARRAY(c->nz);
3287   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3288 
3289   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3290   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3291      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3292      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3293   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3294                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3295   PetscCallCUSPARSE(stat);
3296 #endif
3297   PetscCall(PetscLogGpuFlops(mmdata->flops));
3298   PetscCall(PetscLogGpuTimeEnd());
3299 finalizesym:
3300   c->singlemalloc = PETSC_FALSE;
3301   c->free_a       = PETSC_TRUE;
3302   c->free_ij      = PETSC_TRUE;
3303   PetscCall(PetscMalloc1(m + 1, &c->i));
3304   PetscCall(PetscMalloc1(c->nz, &c->j));
3305   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3306     PetscInt      *d_i = c->i;
3307     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3308     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3309     ii = *Ccsr->row_offsets;
3310     jj = *Ccsr->column_indices;
3311     if (ciscompressed) d_i = c->compressedrow.i;
3312     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3313     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3314   } else {
3315     PetscInt *d_i = c->i;
3316     if (ciscompressed) d_i = c->compressedrow.i;
3317     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3318     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3319   }
3320   if (ciscompressed) { /* need to expand host row offsets */
3321     PetscInt r = 0;
3322     c->i[0]    = 0;
3323     for (k = 0; k < c->compressedrow.nrows; k++) {
3324       const PetscInt next = c->compressedrow.rindex[k];
3325       const PetscInt old  = c->compressedrow.i[k];
3326       for (; r < next; r++) c->i[r + 1] = old;
3327     }
3328     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3329   }
3330   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3331   PetscCall(PetscMalloc1(m, &c->ilen));
3332   PetscCall(PetscMalloc1(m, &c->imax));
3333   c->maxnz         = c->nz;
3334   c->nonzerorowcnt = 0;
3335   c->rmax          = 0;
3336   for (k = 0; k < m; k++) {
3337     const PetscInt nn = c->i[k + 1] - c->i[k];
3338     c->ilen[k] = c->imax[k] = nn;
3339     c->nonzerorowcnt += (PetscInt) !!nn;
3340     c->rmax = PetscMax(c->rmax, nn);
3341   }
3342   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3343   PetscCall(PetscMalloc1(c->nz, &c->a));
3344   Ccsr->num_entries = c->nz;
3345 
3346   C->nonzerostate++;
3347   PetscCall(PetscLayoutSetUp(C->rmap));
3348   PetscCall(PetscLayoutSetUp(C->cmap));
3349   Ccusp->nonzerostate = C->nonzerostate;
3350   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3351   C->preallocated     = PETSC_TRUE;
3352   C->assembled        = PETSC_FALSE;
3353   C->was_assembled    = PETSC_FALSE;
3354   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3355     mmdata->reusesym = PETSC_TRUE;
3356     C->offloadmask   = PETSC_OFFLOAD_GPU;
3357   }
3358   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3359   PetscFunctionReturn(PETSC_SUCCESS);
3360 }
3361 
3362 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3363 
/*
  Chooses mat->ops->productsymbolic for the product described by mat->product. Handles sparse or dense B.

  The GPU sparse-sparse path is only considered when neither A nor B is bound to the CPU, B is of type
  MATSEQAIJCUSPARSE and, for MATPRODUCT_ABC, C is also an unbound MATSEQAIJCUSPARSE matrix. In that case
  the user may still request the CPU code through a command line option.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!(product->A->boundtocpu || product->B->boundtocpu)) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    /* the third operand only matters for ABC; it qualifies only if it is not bound to the CPU */
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    const PetscBool api     = product->api_user;
    PetscBool       usecpu  = PETSC_FALSE;
    const char     *title   = NULL; /* options block heading */
    const char     *optname = NULL; /* name of the command line switch */
    const char     *manpage = NULL; /* manual page reported for the switch; also marks "this type has a switch" */

    /* pick the strings for this product type; types without a switch leave manpage NULL */
    switch (product->type) {
    case MATPRODUCT_AB:
      manpage = "MatMatMult";
      title   = api ? "MatMatMult" : "MatProduct_AB";
      optname = api ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      manpage = "MatTransposeMatMult";
      title   = api ? "MatTransposeMatMult" : "MatProduct_AtB";
      optname = api ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      manpage = "MatPtAP";
      title   = api ? "MatPtAP" : "MatProduct_PtAP";
      optname = api ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      manpage = "MatRARt";
      title   = api ? "MatRARt" : "MatProduct_RARt";
      optname = api ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      manpage = "MatMatMatMult";
      title   = api ? "MatMatMatMult" : "MatProduct_ABC";
      optname = api ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    default:
      break;
    }
    if (manpage) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(optname, "Use CPU code", manpage, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    /* dropping both flags routes the dispatch below to the AIJ fallback */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      }
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products use the basic ABC symbolic routine */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3481 
/* MatMult: yy = A * xx. Runs the shared kernel with no vector to add (NULL) and no transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3488 
/* MatMultAdd: zz = A * xx + yy. Runs the shared kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3495 
/* MatMultHermitianTranspose: yy = A^H * xx. Runs the shared kernel with both the transpose and Hermitian flags set and nothing to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3502 
/* MatMultHermitianTransposeAdd: zz = A^H * xx + yy. Runs the shared kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3509 
/* MatMultTranspose: yy = A^T * xx. Runs the shared kernel with the transpose flag only and nothing to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3516 
/*
  ScatterAdd - device kernel computing y[idx[i]] += x[i] for 0 <= i < n

  Expects a 1D launch: each thread handles the entry given by its global x-index, and threads whose index
  is >= n do nothing, so the launch must provide at least n threads in total.

  The update is a plain (non-atomic) read-modify-write, so the entries idx[0..n) are assumed to be distinct.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Form the global index in PetscInt rather than in 32-bit arithmetic stored in an int: with 64-bit
     PetscInt the product blockIdx.x * blockDim.x could otherwise wrap before being compared against n */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3522 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - common implementation of the MatMult/MatMultAdd family

  z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

  Input Parameters:
+ A     - the matrix; its GPU copy is brought up to date before the product
. xx    - the vector x
. yy    - the vector y to add, or NULL for a plain product; may be the same Vec as zz
. trans - PETSC_TRUE to apply the transpose (or Hermitian transpose) of A
- herm  - PETSC_TRUE for the Hermitian transpose; requires trans, otherwise an error is raised

  Output Parameter:
. zz - the result vector z

  Notes:
  When the stored matrix has compressed rows (matstruct->cprowIndices is set), the SpMV works on a shorter
  vector held in cusparsestruct->workVector, which is scattered from x (transpose case) or scatter-added
  into z (non-transpose case).
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* only set and used for MAT_CUSPARSE_CSR */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* A has no nonzeros, so op(A) x = 0: the result is just y, or zero when there is no y */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose operation to the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use the explicitly stored transpose (built on first use); opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        /* the roles of rows and columns are swapped relative to the non-transpose case */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA is used as an index into matstruct->cuSpMV[], hence the range check */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        const PetscInt n = (PetscInt)matstruct->cprowIndices->size();

        if (n > 0) { /* nothing to scatter otherwise, and a grid with zero blocks is not a valid launch configuration */
          ScatterAdd<<<(unsigned int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
          PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status, so query it explicitly */
        }
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop count: one multiply and one add per nonzero when y is added; without y each nonzero row is credited one add fewer */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3693 
/* MatMultTransposeAdd: zz = A^T * xx + yy. Runs the shared kernel with the transpose flag only */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3700 
/* Assembly completion for SeqAIJCUSPARSE: forwards to the SeqAIJ implementation with the same assembly mode */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3707 
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix of type `MATSEQAIJCUSPARSE`, i.e. in AIJ (compressed row)
   format. This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library
   for calculations. For good matrix assembly performance the user should preallocate the matrix
   storage by setting the parameter `nz` (or the array `nnz`).

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
-  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

   Specify the preallocated storage with either `nz` or `nnz` (not both).
   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
   allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  Mat newmat;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  newmat = *A; /* the output is set as soon as the matrix exists; the remaining steps configure it */
  /* local and global sizes are both m x n */
  PetscCall(MatSetSizes(newmat, m, n, m, n));
  PetscCall(MatSetType(newmat, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(newmat, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3754 
/*
  Destroys the CUSPARSE-specific data attached to A, removes the functions composed on the object under
  the names listed below, then hands the matrix to the SeqAIJ destroy routine.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in the order they are cleared */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };
  const size_t ncomposed = sizeof(composed) / sizeof(composed[0]);

  PetscFunctionBegin;
  /* A->spptr holds triangular-factor data for factored matrices; otherwise the regular destroy routine is used */
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  }
  for (size_t k = 0; k < ncomposed; k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3776 
3777 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3778 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicates A with the SeqAIJ routine, then converts the new matrix in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* step 1: SeqAIJ duplicate, honoring cpvalues */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* step 2: in-place conversion; *B is both the input and the output of the conversion */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3786 
/*
  MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y

  Input Parameters:
+ Y   - the matrix that is updated
. a   - the scalar multiplying X
. X   - the matrix that is added
- str - how the nonzero pattern of X relates to that of Y

  Notes:
  If X and Y do not have the same axpy implementation installed, MatAXPY_SeqAIJ() does the work.

  Otherwise both matrices are copied to the GPU (only MAT_CUSPARSE_CSR is accepted) and
    SAME_NONZERO_PATTERN   - a cuBLAS axpy is applied to the two value arrays
    SUBSET_NONZERO_PATTERN - cusparse_csr_spgeam() is used, with the CSR arrays of Y as the output
    anything else          - MatAXPY_SeqAIJ() is used after invalidating the cached transpose of Y

  When X and Y hold the same number of nonzeros and neither uses compressed rows, the row offsets and
  column indices are compared on the device so that identical patterns are treated as SAME_NONZERO_PATTERN.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *xseq = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *yseq = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *xgpu = (Mat_SeqAIJCUSPARSE *)X->spptr;
  Mat_SeqAIJCUSPARSE *ygpu = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  CsrMatrix          *xcsr, *ycsr;
  const PetscScalar  *xval;
  PetscScalar        *yval;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two operands use different axpy implementations: let the host routine handle it */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* from here on both matrices are bound to the GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(ygpu->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(xgpu->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix *)ygpu->mat->mat;
  xcsr = (CsrMatrix *)xgpu->mat->mat;

  /* identical patterns can be handled by a plain cuBLAS axpy */
  if (str != SAME_NONZERO_PATTERN && xseq->nz == yseq->nz && !xseq->compressedrow.use && !yseq->compressedrow.use) {
    bool identical = thrust::equal(thrust::device, ycsr->row_offsets->begin(), ycsr->row_offsets->end(), xcsr->row_offsets->begin());

    identical = identical && thrust::equal(thrust::device, ycsr->column_indices->begin(), ycsr->column_indices->end(), xcsr->column_indices->begin());
    if (identical) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublas;
    PetscBLASInt   inc = 1, nvals = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
    PetscCall(PetscCUBLASGetHandle(&cublas));
    PetscCall(PetscBLASIntCast(xseq->nz, &nvals));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublas, nvals, &a, xval, inc, yval, inc));
    PetscCall(PetscLogGpuFlops(2.0 * nvals));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar beta = 1.0; /* coefficient of Y in a*X + beta*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t workBytes;
    void  *work;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));

    /* raw device pointers of the two CSR structures; Y is both an input and the output */
    int *xrow = xcsr->row_offsets->data().get();
    int *xcol = xcsr->column_indices->data().get();
    int *yrow = ycsr->row_offsets->data().get();
    int *ycol = ycsr->column_indices->data().get();

    /* the scalars a and beta live on the host */
    PetscCallCUSPARSE(cusparseSetPointerMode(ygpu->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(ygpu->handle, Y->rmap->n, Y->cmap->n, &a, xgpu->mat->descr, xseq->nz, xval, xrow, xcol, &beta, ygpu->mat->descr, yseq->nz, yval, yrow, ycol, ygpu->mat->descr, yval, yrow, ycol, &workBytes));
    PetscCallCUDA(cudaMalloc(&work, workBytes));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ygpu->handle, Y->rmap->n, Y->cmap->n, &a, xgpu->mat->descr, xseq->nz, xval, xrow, xcol, &beta, ygpu->mat->descr, yseq->nz, yval, yrow, ycol, ygpu->mat->descr, yval, yrow, ycol, work));
    PetscCall(PetscLogGpuFlops(xseq->nz + yseq->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(work));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ygpu->handle, Y->rmap->n, Y->cmap->n, &a, xgpu->mat->descr, xseq->nz, xval, xrow, xcol, &beta, ygpu->mat->descr, yseq->nz, yval, yrow, ycol, ygpu->mat->descr, yval, yrow, ycol));
    PetscCall(PetscLogGpuFlops(xseq->nz + yseq->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(ygpu->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* general pattern: done by the host routine */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3872 
/*
  MatScale_SeqAIJCUSPARSE - computes Y = a*Y by applying a cuBLAS scal to the device value array

  Input Parameters:
+ Y - the matrix to scale
- a - the scaling factor
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *yseq = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t cublas;
  PetscScalar   *yval;
  PetscBLASInt   inc = 1, nvals = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
  PetscCall(PetscCUBLASGetHandle(&cublas));
  PetscCall(PetscBLASIntCast(yseq->nz, &nvals));

  /* timed region: one scal over all stored values */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublas, nvals, &a, yval, inc));
  PetscCall(PetscLogGpuFlops(nvals));
  PetscCall(PetscLogGpuTimeEnd());

  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3892 
/*
  MatZeroEntries_SeqAIJCUSPARSE - zeros the stored values of the matrix on the host and, when present, on the device

  Input Parameter:
. A - the matrix

  Notes:
  For unfactored matrices the device value arrays of the matrix and of its cached transpose are
  filled with zeros when they exist. The host value array is always zeroed.

  The offload mask ends up as PETSC_OFFLOAD_BOTH when the device values of the matrix itself were
  zeroed, and as PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij          = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroedOnGpu  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *fwd = (CsrMatrix *)cusp->mat->mat;

      if (fwd->values) {
        zeroedOnGpu = PETSC_TRUE;
        thrust::fill(thrust::device, fwd->values->begin(), fwd->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *trn = (CsrMatrix *)cusp->matTranspose->mat;

      if (trn->values) thrust::fill(thrust::device, trn->values->begin(), trn->values->end(), 0.);
    }
  }
  /* host copy: a->i[nrows] is the number of stored entries */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = zeroedOnGpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3919 
/*
  MatBindToCPU_SeqAIJCUSPARSE - switches the matrix between its host (SeqAIJ) and device (cuSPARSE) operations

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE to install the host implementations, PETSC_FALSE to install the device ones

  Notes:
  For factored matrices only the boundtocpu flag is recorded.

  Binding to the CPU first copies the matrix back from the GPU, then installs the SeqAIJ function
  pointers, clears the Mat_SeqAIJOps table and removes the composed GPU functions. Unbinding
  installs the cuSPARSE counterparts again.

  Inodes are used only while bound to the CPU, and only if inode sizes are available.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;
  PetscObject obj = (PetscObject)A;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  if (!flg) {
    /* device implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype        = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction(obj, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction(obj, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction(obj, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction(obj, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction(obj, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction(obj, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* bring the data back before the host implementations take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    /* host implementations */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop every array-access override */
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));

    PetscCall(PetscObjectComposeFunction(obj, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction(obj, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction(obj, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction(obj, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction(obj, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction(obj, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  }

  A->boundtocpu  = flg;
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3983 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE matrix

  Input Parameters:
+ A     - the source matrix
. (unnamed MatType) - not used
- reuse - MAT_INITIAL_MATRIX duplicates A into newmat, MAT_REUSE_MATRIX copies A into the existing
          newmat, any other value works directly on *newmat

  Output Parameter:
. newmat - the converted matrix

  Notes:
  The default vector type of the result becomes VECCUDA.

  Unless reusing a matrix, a missing spptr is created: a Mat_SeqAIJCUSPARSE for unfactored matrices,
  a Mat_SeqAIJCUSPARSETriFactors otherwise. Each gets its own cuSPARSE handle on PetscDefaultCudaStream.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* type-specific operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the device implementations of the remaining operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4043 
/*
  MatCreate_SeqAIJCUSPARSE - constructor for MATSEQAIJCUSPARSE

  Input Parameter:
. B - the matrix being created

  Note:
  B is first set up as a SeqAIJ matrix and then converted in place.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* host part */
  PetscCall(MatCreate_SeqAIJ(B));
  /* device part, added by the in-place conversion */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4051 
4052 /*MC
4053    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4054 
4055    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4056    CSR, ELL, or Hybrid format.
4057    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4058 
4059    Options Database Keys:
4060 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4061 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4062                                       Other options include ell (ellpack) or hyb (hybrid).
4063 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4064 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4065 
4066   Level: beginner
4067 
4068 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4069 M*/
4070 
/*
  MatSolverTypeRegister_CUSPARSE - registers MATSOLVERCUSPARSE as a solver for MATSEQAIJCUSPARSE

  Note:
  The same factory, MatGetFactor_seqaijcusparse_cusparse(), is registered for LU, Cholesky, ILU and ICC.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType supported[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (const MatFactorType ftype : supported) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftype, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4081 
/*
  MatSeqAIJCUSPARSE_Destroy - releases the Mat_SeqAIJCUSPARSE structure stored in mat->spptr

  Input Parameter:
. mat - the matrix

  Note:
  Does nothing when mat->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!dev) PetscFunctionReturn(PETSC_SUCCESS);

  /* the matrix and its cached transpose */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&dev->mat, dev->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&dev->matTranspose, dev->format));

  /* auxiliary arrays */
  delete dev->workVector;
  delete dev->rowoffsets_gpu;
  delete dev->csr2csc_i;
  delete dev->coords;

  if (dev->handle) PetscCallCUSPARSE(cusparseDestroy(dev->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4099 
/*
  CsrMatrix_Destroy - deletes a CsrMatrix together with its three arrays and resets the pointer

  Input/Output Parameter:
. mat - address of the CsrMatrix pointer; set to NULL on return. Nothing happens if *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4112 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - releases one triangular factor structure (overload used before CUDA 11.4)

  Input/Output Parameter:
. trifactor - address of the structure pointer. Nothing happens if *trifactor is NULL.

  Note:
  Every member is released only if it is set.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(PETSC_SUCCESS);

  /* cuSPARSE objects */
  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));

  /* factor data and work buffers */
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif

  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4131 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - releases a Mat_SeqAIJCUSPARSEMultStruct and resets the pointer

  Input Parameter:
. format - storage format of the matrix held by the structure

  Input/Output Parameter:
. matstruct - address of the structure pointer; set to NULL on return. Nothing happens if it is already NULL.

  Note:
  With CUDA 11.0 or later an error is raised if the structure holds a matrix in ELL or HYB format.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms = *matstruct;

  PetscFunctionBegin;
  if (!ms) PetscFunctionReturn(PETSC_SUCCESS);

  /* the stored matrix */
  if (ms->mat) {
    if (format != MAT_CUSPARSE_ELL && format != MAT_CUSPARSE_HYB) {
      CsrMatrix *csr = (CsrMatrix *)ms->mat;

      PetscCall(CsrMatrix_Destroy(&csr));
    } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      PetscCallCUSPARSE(cusparseDestroyHybMat((cusparseHybMat_t)ms->mat));
#endif
    }
  }

  /* descriptor, compressed row indices and the device scalars */
  if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
  delete ms->cprowIndices;
  if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
  if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
  if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* generic-API descriptor and the per-operation SpMV resources */
  if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));
  for (int op = 0; op < 3; ++op) {
    if (!ms->cuSpMV[op].initialized) continue;
    PetscCallCUDA(cudaFree(ms->cuSpMV[op].spmvBuffer));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[op].vecXDescr));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[op].vecYDescr));
  }
#endif
  delete ms;
  *matstruct = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4173 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases everything held by a Mat_SeqAIJCUSPARSETriFactors except
  the structure itself and its cuSPARSE handle

  Input Parameter:
. trifactors - address of the structure pointer. Nothing happens if *trifactors is NULL.

  Notes:
  Every released member is set back to NULL so that the routine can safely be called again on the
  same structure; MatSeqAIJCUSPARSETriFactors_Destroy() always calls it, whether or not an earlier
  reset already happened.

  With CUDA 11.4 or later the release calls are issued unconditionally, also for members that were
  never set up.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;

    /* work buffers; factBuffer_M is not freed on its own since it shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    fs->factBuffer_M = NULL; /* its storage was released with spsvBuffer_L/U just above */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;

    /* cuSPARSE descriptors and info objects */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    /* host arrays */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4227 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a Mat_SeqAIJCUSPARSETriFactors, destroys its cuSPARSE
  handle and frees the structure

  Input/Output Parameter:
. trifactors - address of the structure pointer. Nothing happens if *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  /* contents first, then the handle, then the structure */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4238 
/* Strict lexicographic "less than" on pairs of indices: first component first, second component as tie-breaker */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &lhs, const thrust::tuple<PetscInt, PetscInt> &rhs)
  {
    const PetscInt lfirst = lhs.get<0>();
    const PetscInt rfirst = rhs.get<0>();

    /* differing first components decide; otherwise the second components do */
    return (lfirst != rfirst) ? (lfirst < rfirst) : (lhs.get<1>() < rhs.get<1>());
  }
};
4247 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached transpose of a MATSEQAIJCUSPARSE matrix as out of date

  Input Parameters:
+ A       - the matrix
- destroy - if PETSC_TRUE, the cached transpose and the csr2csc index array are also released

  Note:
  Nothing is changed when A->spptr is NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (dev) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&dev->matTranspose, dev->format));
      delete dev->csr2csc_i;
      dev->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4263 
/*
  MatCOOStructDestroy_SeqAIJCUSPARSE - destroy callback for the container holding the device COO structure

  Input Parameter:
. data - the MatCOOStruct_SeqAIJ to release

  Note:
  perm and jmap are released with cudaFree(), the structure itself with PetscFree().
*/
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *cooDev = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  /* device arrays */
  PetscCallCUDA(cudaFree(cooDev->perm));
  PetscCallCUDA(cudaFree(cooDev->jmap));
  /* host structure */
  PetscCall(PetscFree(cooDev));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4273 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices, in host or device memory
- coo_j - column indices, in the same memory space as coo_i

  Notes:
  The preallocation itself is done by MatSetPreallocationCOO_SeqAIJ(); indices given in device memory
  are first copied to temporary host arrays.

  Afterwards the matrix is copied to the GPU and a copy of the host COO structure, with jmap and perm
  in device memory, is attached to the matrix as "__PETSc_MatCOOStruct_Device".
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         mtype    = PETSC_MEMTYPE_HOST;
  PetscBool            staged   = PETSC_FALSE; /* PETSC_TRUE when the indices were copied to temporary host arrays */
  PetscInt            *rows     = coo_i;
  PetscInt            *cols     = coo_j;
  PetscContainer       hostBox, devBox;
  MatCOOStruct_SeqAIJ *hostCoo, *devCoo;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    const size_t ijBytes = coo_n * sizeof(PetscInt);

    staged = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &rows, coo_n, &cols));
    PetscCallCUDA(cudaMemcpy(rows, coo_i, ijBytes, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(cols, coo_j, ijBytes, cudaMemcpyDeviceToHost));
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, rows, cols));
  if (staged) PetscCall(PetscFree2(rows, cols));

  /* the up-to-date data is on the host; this creates the GPU memory */
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  /* fetch the COO struct created on the host */
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&hostBox));
  PetscCall(PetscContainerGetPointer(hostBox, (void **)&hostCoo));

  /* shallow copy, then replace the two arrays by device copies */
  PetscCall(PetscMalloc1(1, &devCoo));
  *devCoo = *hostCoo;
  {
    const size_t jmapBytes = (hostCoo->nz + 1) * sizeof(PetscCount);
    const size_t permBytes = hostCoo->Atot * sizeof(PetscCount);

    PetscCallCUDA(cudaMalloc((void **)&devCoo->jmap, jmapBytes));
    PetscCallCUDA(cudaMemcpy(devCoo->jmap, hostCoo->jmap, jmapBytes, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&devCoo->perm, permBytes));
    PetscCallCUDA(cudaMemcpy(devCoo->perm, hostCoo->perm, permBytes, cudaMemcpyHostToDevice));
  }

  /* hand the device struct to the matrix through a container that knows how to destroy it */
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &devBox));
  PetscCall(PetscContainerSetPointer(devBox, devCoo));
  PetscCall(PetscContainerSetUserDestroy(devBox, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)devBox));
  PetscCall(PetscContainerDestroy(&devBox));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4319 
/*
  MatAddCOOValues - kernel that accumulates COO values into the CSR value array

  Input Parameters:
+ kv    - the COO values
. nnz   - number of entries of a[] to compute
. jmap  - for entry i, perm[jmap[i]] .. perm[jmap[i+1]-1] are the positions in kv[] that contribute to a[i]
. perm  - positions in kv[]
- imode - with INSERT_VALUES the previous content of a[] is discarded, otherwise the sum is added to it

  Output Parameter:
. a - the matrix values

  Notes:
  Expects a 1D grid of 1D blocks. Entries are processed in a grid-stride loop, so any grid size covers
  all nnz entries, and each a[i] is written by exactly one thread.

  The thread index and the stride are widened to PetscCount before the multiplication so that the
  products are not computed in 32-bit arithmetic.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4330 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the matrix values from COO values, on the GPU

  Input Parameters:
+ A     - the matrix, already preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE()
. v     - the COO values, in host or device memory
- imode - INSERT_VALUES to overwrite the matrix values, otherwise the new values are added

  Notes:
  Values given in host memory are copied to a temporary device buffer of coo->n scalars, which is
  released before returning.

  An error is raised when the device COO structure is not attached to the matrix, instead of using a
  NULL container.

  The kernel launch is bracketed by PetscLogGpuTimeBegin()/PetscLogGpuTimeEnd() like the other GPU
  operations of this matrix type.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Not found MatCOOStruct on this matrix, MatSetPreallocationCOO() must be called first");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* write-only access is enough when the old values are discarded */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  /* release the temporary device copy of v[] */
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4368 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

   Not Collective

    Input Parameters:
+   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers, pass `NULL` if not needed
-   j - the CSR column indices, pass `NULL` if not needed

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      If both `i` and `j` are `NULL` the routine returns immediately

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* nothing requested; a single NULL output must not suppress the other one */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* built once from the host row pointers and cached in the matrix */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4416 
4417 /*@C
4418     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4419 
4420    Not Collective
4421 
4422     Input Parameters:
4423 +   A - the matrix
4424 .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4425 .   i - the CSR row pointers
4426 -   j - the CSR column indices
4427 
4428     Level: developer
4429 
4430 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4431 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed; /* not needed to restore */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate whichever pointers the caller handed back */
  if (i) { *i = NULL; }
  if (j) { *j = NULL; }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4442 
4443 /*@C
4444    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4445 
4446    Not Collective
4447 
4448    Input Parameter:
4449 .   A - a `MATSEQAIJCUSPARSE` matrix
4450 
4451    Output Parameter:
4452 .   a - pointer to the device data
4453 
4454    Level: developer
4455 
4456    Note:
4457    May trigger host-device copies if up-to-date matrix data is on host
4458 
4459 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4460 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *devCsr;

  PetscFunctionBegin;
  /* argument and format checks */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(dev->format != MAT_CUSPARSE_ELL && dev->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");

  /* make sure the device copy exists and is current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(dev->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  devCsr = (CsrMatrix *)dev->mat->mat;
  PetscCheck(devCsr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");

  /* raw device pointer to the values */
  *a = devCsr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4478 
4479 /*@C
4480    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4481 
4482    Not Collective
4483 
4484    Input Parameters:
4485 +   A - a `MATSEQAIJCUSPARSE` matrix
4486 -   a - pointer to the device data
4487 
4488    Level: developer
4489 
4490 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4491 @*/
4492 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4493 {
4494   PetscFunctionBegin;
4495   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4496   PetscValidPointer(a, 2);
4497   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4498   *a = NULL;
4499   PetscFunctionReturn(PETSC_SUCCESS);
4500 }
4501 
4502 /*@C
4503    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4504 
4505    Not Collective
4506 
4507    Input Parameter:
4508 .   A - a `MATSEQAIJCUSPARSE` matrix
4509 
4510    Output Parameter:
4511 .   a - pointer to the device data
4512 
4513    Level: developer
4514 
4515    Note:
4516    May trigger host-device copies if up-to-date matrix data is on host
4517 
4518 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4519 @*/
4520 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4521 {
4522   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4523   CsrMatrix          *csr;
4524 
4525   PetscFunctionBegin;
4526   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4527   PetscValidPointer(a, 2);
4528   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4529   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4530   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4531   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4532   csr = (CsrMatrix *)cusp->mat->mat;
4533   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4534   *a             = csr->values->data().get();
4535   A->offloadmask = PETSC_OFFLOAD_GPU;
4536   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4537   PetscFunctionReturn(PETSC_SUCCESS);
4538 }
4539 /*@C
4540    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4541 
4542    Not Collective
4543 
4544    Input Parameters:
4545 +   A - a `MATSEQAIJCUSPARSE` matrix
4546 -   a - pointer to the device data
4547 
4548    Level: developer
4549 
4550 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4551 @*/
4552 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4553 {
4554   PetscFunctionBegin;
4555   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4556   PetscValidPointer(a, 2);
4557   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4558   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4559   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4560   *a = NULL;
4561   PetscFunctionReturn(PETSC_SUCCESS);
4562 }
4563 
4564 /*@C
4565    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4566 
4567    Not Collective
4568 
4569    Input Parameter:
4570 .   A - a `MATSEQAIJCUSPARSE` matrix
4571 
4572    Output Parameter:
4573 .   a - pointer to the device data
4574 
4575    Level: developer
4576 
4577    Note:
4578    Does not trigger host-device copies and flags data validity on the GPU
4579 
4580 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4581 @*/
4582 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4583 {
4584   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4585   CsrMatrix          *csr;
4586 
4587   PetscFunctionBegin;
4588   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4589   PetscValidPointer(a, 2);
4590   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4591   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4592   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4593   csr = (CsrMatrix *)cusp->mat->mat;
4594   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4595   *a             = csr->values->data().get();
4596   A->offloadmask = PETSC_OFFLOAD_GPU;
4597   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4598   PetscFunctionReturn(PETSC_SUCCESS);
4599 }
4600 
4601 /*@C
4602    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4603 
4604    Not Collective
4605 
4606    Input Parameters:
4607 +   A - a `MATSEQAIJCUSPARSE` matrix
4608 -   a - pointer to the device data
4609 
4610    Level: developer
4611 
4612 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4613 @*/
4614 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4615 {
4616   PetscFunctionBegin;
4617   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4618   PetscValidPointer(a, 2);
4619   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4620   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4621   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4622   *a = NULL;
4623   PetscFunctionReturn(PETSC_SUCCESS);
4624 }
4625 
4626 struct IJCompare4 {
4627   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4628   {
4629     if (t1.get<0>() < t2.get<0>()) return true;
4630     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4631     return false;
4632   }
4633 };
4634 
4635 struct Shift {
4636   int _shift;
4637 
4638   Shift(int shift) : _shift(shift) { }
4639   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4640 };
4641 
4642 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4643 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4644 {
4645   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4646   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4647   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4648   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4649   PetscInt                      Annz, Bnnz;
4650   cusparseStatus_t              stat;
4651   PetscInt                      i, m, n, zero = 0;
4652 
4653   PetscFunctionBegin;
4654   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4655   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4656   PetscValidPointer(C, 4);
4657   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4658   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4659   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4660   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4661   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4662   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4663   if (reuse == MAT_INITIAL_MATRIX) {
4664     m = A->rmap->n;
4665     n = A->cmap->n + B->cmap->n;
4666     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4667     PetscCall(MatSetSizes(*C, m, n, m, n));
4668     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4669     c                       = (Mat_SeqAIJ *)(*C)->data;
4670     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4671     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4672     Ccsr                    = new CsrMatrix;
4673     Cmat->cprowIndices      = NULL;
4674     c->compressedrow.use    = PETSC_FALSE;
4675     c->compressedrow.nrows  = 0;
4676     c->compressedrow.i      = NULL;
4677     c->compressedrow.rindex = NULL;
4678     Ccusp->workVector       = NULL;
4679     Ccusp->nrows            = m;
4680     Ccusp->mat              = Cmat;
4681     Ccusp->mat->mat         = Ccsr;
4682     Ccsr->num_rows          = m;
4683     Ccsr->num_cols          = n;
4684     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4685     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4686     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4687     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4688     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4689     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4690     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4691     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4692     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4693     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4694     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4695     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4696     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4697 
4698     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4699     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4700     Annz                 = (PetscInt)Acsr->column_indices->size();
4701     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4702     c->nz                = Annz + Bnnz;
4703     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4704     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4705     Ccsr->values         = new THRUSTARRAY(c->nz);
4706     Ccsr->num_entries    = c->nz;
4707     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4708     if (c->nz) {
4709       auto              Acoo = new THRUSTINTARRAY32(Annz);
4710       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4711       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4712       THRUSTINTARRAY32 *Aroff, *Broff;
4713 
4714       if (a->compressedrow.use) { /* need full row offset */
4715         if (!Acusp->rowoffsets_gpu) {
4716           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4717           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4718           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4719         }
4720         Aroff = Acusp->rowoffsets_gpu;
4721       } else Aroff = Acsr->row_offsets;
4722       if (b->compressedrow.use) { /* need full row offset */
4723         if (!Bcusp->rowoffsets_gpu) {
4724           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4725           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4726           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4727         }
4728         Broff = Bcusp->rowoffsets_gpu;
4729       } else Broff = Bcsr->row_offsets;
4730       PetscCall(PetscLogGpuTimeBegin());
4731       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4732       PetscCallCUSPARSE(stat);
4733       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4734       PetscCallCUSPARSE(stat);
4735       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4736       auto Aperm = thrust::make_constant_iterator(1);
4737       auto Bperm = thrust::make_constant_iterator(0);
4738 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4739       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4740       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4741 #else
4742       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4743       auto Bcib = Bcsr->column_indices->begin();
4744       auto Bcie = Bcsr->column_indices->end();
4745       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4746 #endif
4747       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4748       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4749       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4750       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4751       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4752       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4753       auto p1    = Ccusp->coords->begin();
4754       auto p2    = Ccusp->coords->begin();
4755       thrust::advance(p2, Annz);
4756       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4757 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4758       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4759 #endif
4760       auto cci = thrust::make_counting_iterator(zero);
4761       auto cce = thrust::make_counting_iterator(c->nz);
4762 #if 0 //Errors on SUMMIT cuda 11.1.0
4763       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4764 #else
4765       auto pred = thrust::identity<int>();
4766       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4767       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4768 #endif
4769       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4770       PetscCallCUSPARSE(stat);
4771       PetscCall(PetscLogGpuTimeEnd());
4772       delete wPerm;
4773       delete Acoo;
4774       delete Bcoo;
4775       delete Ccoo;
4776 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4777       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4778       PetscCallCUSPARSE(stat);
4779 #endif
4780       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4781         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4782         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4783         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4784         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4785         CsrMatrix                    *CcsrT = new CsrMatrix;
4786         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4787         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4788 
4789         (*C)->form_explicit_transpose = PETSC_TRUE;
4790         (*C)->transupdated            = PETSC_TRUE;
4791         Ccusp->rowoffsets_gpu         = NULL;
4792         CmatT->cprowIndices           = NULL;
4793         CmatT->mat                    = CcsrT;
4794         CcsrT->num_rows               = n;
4795         CcsrT->num_cols               = m;
4796         CcsrT->num_entries            = c->nz;
4797 
4798         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4799         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4800         CcsrT->values         = new THRUSTARRAY(c->nz);
4801 
4802         PetscCall(PetscLogGpuTimeBegin());
4803         auto rT = CcsrT->row_offsets->begin();
4804         if (AT) {
4805           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4806           thrust::advance(rT, -1);
4807         }
4808         if (BT) {
4809           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4810           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4811           thrust::copy(titb, tite, rT);
4812         }
4813         auto cT = CcsrT->column_indices->begin();
4814         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4815         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4816         auto vT = CcsrT->values->begin();
4817         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4818         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4819         PetscCall(PetscLogGpuTimeEnd());
4820 
4821         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4822         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4823         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4824         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
4825         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
4826         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
4827         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4828         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4829         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4830 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4831         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4832         PetscCallCUSPARSE(stat);
4833 #endif
4834         Ccusp->matTranspose = CmatT;
4835       }
4836     }
4837 
4838     c->singlemalloc = PETSC_FALSE;
4839     c->free_a       = PETSC_TRUE;
4840     c->free_ij      = PETSC_TRUE;
4841     PetscCall(PetscMalloc1(m + 1, &c->i));
4842     PetscCall(PetscMalloc1(c->nz, &c->j));
4843     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4844       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4845       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4846       ii = *Ccsr->row_offsets;
4847       jj = *Ccsr->column_indices;
4848       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4849       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4850     } else {
4851       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4852       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4853     }
4854     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4855     PetscCall(PetscMalloc1(m, &c->ilen));
4856     PetscCall(PetscMalloc1(m, &c->imax));
4857     c->maxnz         = c->nz;
4858     c->nonzerorowcnt = 0;
4859     c->rmax          = 0;
4860     for (i = 0; i < m; i++) {
4861       const PetscInt nn = c->i[i + 1] - c->i[i];
4862       c->ilen[i] = c->imax[i] = nn;
4863       c->nonzerorowcnt += (PetscInt) !!nn;
4864       c->rmax = PetscMax(c->rmax, nn);
4865     }
4866     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4867     PetscCall(PetscMalloc1(c->nz, &c->a));
4868     (*C)->nonzerostate++;
4869     PetscCall(PetscLayoutSetUp((*C)->rmap));
4870     PetscCall(PetscLayoutSetUp((*C)->cmap));
4871     Ccusp->nonzerostate = (*C)->nonzerostate;
4872     (*C)->preallocated  = PETSC_TRUE;
4873   } else {
4874     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4875     c = (Mat_SeqAIJ *)(*C)->data;
4876     if (c->nz) {
4877       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4878       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4879       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4880       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4881       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4882       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4883       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4884       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4885       Acsr = (CsrMatrix *)Acusp->mat->mat;
4886       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4887       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4888       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4889       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4890       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4891       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4892       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4893       auto pmid = Ccusp->coords->begin();
4894       thrust::advance(pmid, Acsr->num_entries);
4895       PetscCall(PetscLogGpuTimeBegin());
4896       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4897       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4898       thrust::for_each(zibait, zieait, VecCUDAEquals());
4899       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4900       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4901       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4902       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4903       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4904         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4905         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4906         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4907         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4908         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4909         auto       vT    = CcsrT->values->begin();
4910         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4911         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4912         (*C)->transupdated = PETSC_TRUE;
4913       }
4914       PetscCall(PetscLogGpuTimeEnd());
4915     }
4916   }
4917   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4918   (*C)->assembled     = PETSC_TRUE;
4919   (*C)->was_assembled = PETSC_FALSE;
4920   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4921   PetscFunctionReturn(PETSC_SUCCESS);
4922 }
4923 
4924 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4925 {
4926   bool               dmem;
4927   const PetscScalar *av;
4928 
4929   PetscFunctionBegin;
4930   dmem = isCudaMem(v);
4931   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4932   if (n && idx) {
4933     THRUSTINTARRAY widx(n);
4934     widx.assign(idx, idx + n);
4935     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4936 
4937     THRUSTARRAY                    *w = NULL;
4938     thrust::device_ptr<PetscScalar> dv;
4939     if (dmem) {
4940       dv = thrust::device_pointer_cast(v);
4941     } else {
4942       w  = new THRUSTARRAY(n);
4943       dv = w->data();
4944     }
4945     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4946 
4947     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4948     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4949     thrust::for_each(zibit, zieit, VecCUDAEquals());
4950     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4951     delete w;
4952   } else {
4953     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4954   }
4955   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4956   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4957   PetscFunctionReturn(PETSC_SUCCESS);
4958 }
4959