xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 6eb97ccc38b40c88c1b2d075af8d591c5278bc44)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
/* Human-readable names for MatCUSPARSEStorageFormat, in 0-based enum-value order so PetscOptionsEnum() can parse them;
   the trailing entries (type name, option prefix, NULL) follow the PETSc enum-string array convention */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29 
30   typedef enum {
31       CUSPARSE_MV_ALG_DEFAULT = 0,
32       CUSPARSE_COOMV_ALG      = 1,
33       CUSPARSE_CSRMV_ALG1     = 2,
34       CUSPARSE_CSRMV_ALG2     = 3
35   } cusparseSpMVAlg_t;
36 
37   typedef enum {
38       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44       CUSPARSE_SPMM_COO_ALG1    = 1,
45       CUSPARSE_SPMM_COO_ALG2    = 2,
46       CUSPARSE_SPMM_COO_ALG3    = 3,
47       CUSPARSE_SPMM_COO_ALG4    = 5,
48       CUSPARSE_SPMM_CSR_ALG1    = 4,
49       CUSPARSE_SPMM_CSR_ALG2    = 6,
50   } cusparseSpMMAlg_t;
51 
52   typedef enum {
53       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55   } cusparseCsr2CscAlg_t;
56   */
/* Name arrays mirroring the cuSPARSE enums documented in the comment above, again in 0-based value order for
   PetscOptionsEnum(). If cuSPARSE renumbers these enums, the PetscCheck()s in MatSetFromOptions_SeqAIJCUSPARSE
   below are meant to catch the mismatch. */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60 #endif
61 
62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
66 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
68 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
72 #endif
73 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
74 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
75 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
76 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
77 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
83 
84 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
85 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
86 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
88 
89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
91 
92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
93 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
94 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95 
/* Type-specific implementation behind MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.
   A sequential matrix has a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both simply record the requested format; any other operation
   (e.g. the MPI-only diag/offdiag variants) is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both ops set the one-and-only format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
113 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

   Note:
   This is a no-op (via `PetscTryMethod()`) if `A` is not of a cusparse type.

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation (e.g. MatCUSPARSESetFormat_SeqAIJCUSPARSE) if one is registered */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
137 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records whether MatSolve()
   should run on the CPU instead of the GPU. No validation is done here; callers reach this
   only through the "MatCUSPARSESetUseCPUSolve_C" composed method, so A->spptr is a
   Mat_SeqAIJCUSPARSE. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
146 
/*@
   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Level: intermediate

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation; silently a no-op for non-cusparse types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
170 
/* MatSetOption() implementation for MATSEQAIJCUSPARSE. Only MAT_FORM_EXPLICIT_TRANSPOSE is
   handled specially; every other option is forwarded to the host MatSetOption_SeqAIJ(). */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* When turning the option off, drop any cached transpose now so that a stale copy cannot
       be picked up if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
186 
/* Process -mat_cusparse_* command-line options for a MATSEQAIJCUSPARSE matrix.
   Options are only parsed for unfactored matrices (A->factortype == MAT_FACTOR_NONE).
   The PetscCheck()s guard against cuSPARSE renumbering its algorithm enums, since
   PetscOptionsEnum() assigns values by position in the Mat...Algorithms[] name arrays. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* Storage format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* Storage format for all operations; parsed after the MULT-only option so it takes precedence when both are given */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
222 
223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or refresh) the device copy of the LU factors of A and redo the cusparseSpSV numeric analysis.
//
// PETSc's host factored aij matrix stores L's strict lower part in a->i/a->j/a->a and U via a->diag
// (with the diagonal stored in a special, skewed layout). This routine re-arranges both factors into one
// regular CSR matrix M = (strict L) + (unit diagonal slot) + (U right of diagonal), uploads it, and creates
// two cusparse descriptors sharing M's arrays: one viewed as lower/unit-diagonal (L) and one as
// upper/non-unit-diagonal (U). The CSR structure, descriptors, SpSV work vectors and buffers are created
// once (when fs->csrRowPtr is NULL) and cached in fs; later calls only refresh the values and re-run
// cusparseSpSV_analysis(), which is numeric.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];    // number of strict-lower entries in row i
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of U entries in row i, diagonal included
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Second descriptor over the same device arrays, this time viewed as the upper factor
      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep row pointers and the host value buffer; Mj is no longer needed once on device
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (presumably stored as its reciprocal on the host — inherited from the existing comment)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
320 #else
// Legacy (CUDA < 11.4) path: extract the unit-lower-triangular ILU factor L from the host
// factored aij matrix (whose rows hold L's strict lower part), insert the unit diagonal
// explicitly, upload it as a 32-bit-index CSR matrix, and run the csrsv solve analysis.
// The factor structure is built once; if loTriFactor already exists, only the numerical
// values are refreshed and re-uploaded.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); // empty matrix: nothing to build
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        // Pinned host buffers so the thrust assign()s below transfer efficiently
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal; each later row is
           its strict-lower entries followed by an explicit 1.0 diagonal */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep AALo cached in AA_h for later value-only updates, free the index scratch */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix: same layout as the structural pass above, values only */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
451 
452 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
453 {
454   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
455   PetscInt                           n                  = A->rmap->n;
456   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
457   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
458   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
459   const MatScalar                   *aa = a->a, *v;
460   PetscInt                          *AiUp, *AjUp;
461   PetscInt                           i, nz, nzUpper, offset;
462 
463   PetscFunctionBegin;
464   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
465   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
466     try {
467       /* next, figure out the number of nonzeros in the upper triangular matrix. */
468       nzUpper = adiag[0] - adiag[n];
469       if (!upTriFactor) {
470         PetscScalar *AAUp;
471 
472         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
473 
474         /* Allocate Space for the upper triangular matrix */
475         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
476         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
477 
478         /* Fill the upper triangular matrix */
479         AiUp[0] = (PetscInt)0;
480         AiUp[n] = nzUpper;
481         offset  = nzUpper;
482         for (i = n - 1; i >= 0; i--) {
483           v  = aa + adiag[i + 1] + 1;
484           vi = aj + adiag[i + 1] + 1;
485 
486           /* number of elements NOT on the diagonal */
487           nz = adiag[i] - adiag[i + 1] - 1;
488 
489           /* decrement the offset */
490           offset -= (nz + 1);
491 
492           /* first, set the diagonal elements */
493           AjUp[offset] = (PetscInt)i;
494           AAUp[offset] = (MatScalar)1. / v[nz];
495           AiUp[i]      = AiUp[i + 1] - (nz + 1);
496 
497           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
498           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
499         }
500 
501         /* allocate space for the triangular factor information */
502         PetscCall(PetscNew(&upTriFactor));
503         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
504 
505         /* Create the matrix description */
506         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
507         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
508   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
509         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
510   #else
511         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
512   #endif
513         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
514         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
515 
516         /* set the operation */
517         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
518 
519         /* set the matrix */
520         upTriFactor->csrMat              = new CsrMatrix;
521         upTriFactor->csrMat->num_rows    = n;
522         upTriFactor->csrMat->num_cols    = n;
523         upTriFactor->csrMat->num_entries = nzUpper;
524 
525         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
526         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
527 
528         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
529         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
530 
531         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
532         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
533 
534         /* Create the solve analysis information */
535         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
537   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
538         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
539                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
540         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
541   #endif
542 
543         /* perform the solve analysis */
544         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
545                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
546 
547         PetscCallCUDA(WaitForCUDA());
548         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
549 
550         /* assign the pointer */
551         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
552         upTriFactor->AA_h                                          = AAUp;
553         PetscCallCUDA(cudaFreeHost(AiUp));
554         PetscCallCUDA(cudaFreeHost(AjUp));
555         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
556       } else {
557         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
558         /* Fill the upper triangular matrix */
559         offset = nzUpper;
560         for (i = n - 1; i >= 0; i--) {
561           v = aa + adiag[i + 1] + 1;
562 
563           /* number of elements NOT on the diagonal */
564           nz = adiag[i] - adiag[i + 1] - 1;
565 
566           /* decrement the offset */
567           offset -= (nz + 1);
568 
569           /* first, set the diagonal elements */
570           upTriFactor->AA_h[offset] = 1. / v[nz];
571           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
572         }
573         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
574         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
575       }
576     } catch (char *ex) {
577       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
578     }
579   }
580   PetscFunctionReturn(PETSC_SUCCESS);
581 }
582 #endif
583 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis = aij->row, colis = aij->icol;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif

  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* Cache the row permutation on the device; skipped when the ordering is the identity */
  PetscCall(ISIdentity(rowis, &rowIdentity));
  if (!rowIdentity && !fs->rpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(rowis, &indices));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(indices, indices + m);
    PetscCall(ISRestoreIndices(rowis, &indices));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* Cache the column permutation on the device; skipped when the ordering is the identity */
  PetscCall(ISIdentity(colis, &colIdentity));
  if (!colIdentity && !fs->cpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(colis, &indices));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(indices, indices + m);
    PetscCall(ISRestoreIndices(colis, &indices));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
630 
631 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Repack the host Cholesky (ICC) factor into a device CSR matrix U (declared unit-diagonal to
// cusparse) plus a separate diagonal array D, and run the SpSV analyses needed to later apply
// x = inv(Ut D U) b in MatSolve_SeqAIJCUSPARSE_Cholesky().
// First call allocates all device storage and descriptors; subsequent calls only refresh values.
// NOTE(review): "Cheolesky" in the name is a typo for "Cholesky"; renaming it requires touching
// the call site in MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() as well.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz; // column indices and nnz of the repacked factor M
  PetscScalar                  *Ma, *D;  // values of M and the (inverted) diagonal

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; values (csrVal) are filled below, outside this first-time branch
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record host-side staging arrays for reuse in later numeric refactorizations (Mj is not needed again)
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // off-diagonal values of U, negated (see MatCholeskyFactorNumeric_SeqAIJ's stored form)
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
716 
// Solve Ut D U x = b (Cholesky/ICC factor built by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fact = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *a    = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> bptr;
  thrust::device_ptr<PetscScalar>       xptr;
  const cusparseSpSVAlg_t               spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              n       = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  bptr = thrust::device_pointer_cast(b_d);
  xptr = thrust::device_pointer_cast(x_d);

  // When a row permutation is present, gather b into the work array X; otherwise hand b to cusparse directly
  if (fact->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, fact->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, fact->rpermIndices->end()), thrust::device_pointer_cast(fact->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, (void *)b_d));
  }

  // First triangular solve: Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_Y, fact->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fact->spMatDescr_U, fact->dnVecDescr_X, fact->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fact->spsvDescr_Ut));

  // Diagonal step: Y <- D .* Y (D already holds the inverted diagonal from MatCholeskyFactorNumeric_SeqAIJ(),
  // so this is an element-wise product; cublas has no such primitive, hence thrust)
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fact->Y), thrust::device_pointer_cast(fact->Y + n), thrust::device_pointer_cast(fact->diag), thrust::device_pointer_cast(fact->Y), thrust::multiplies<PetscScalar>()));

  // Second triangular solve: U X = Y. Without a column permutation we can write straight into x;
  // otherwise solve into the intermediate buffer X and scatter afterwards.
  if (fact->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, x_d));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fact->spMatDescr_U, fact->dnVecDescr_Y, fact->dnVecDescr_X, cusparse_scalartype, spsvAlg, fact->spsvDescr_U));

  // Scatter the permuted solution back into x
  if (fact->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X), fact->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X + n), fact->cpermIndices->end()), xptr));
  }

  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));
  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * a->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772 #else
// Pre-CUDA-11.4 ICC path: upload the host Cholesky factor as two cusparse triangular factors.
// Both factors share the same (upper triangular) sparsity pattern AiUp/AjUp; the "upper" factor
// holds AAUp (inverted diagonal, negated off-diagonals) and is solved NON_TRANSPOSE, while the
// "lower" factor holds AALo (off-diagonals additionally divided by the diagonal) and is solved
// TRANSPOSE with the same UPPER fill mode, which realizes the lower-triangular solve.
// First call allocates descriptors and runs csrsv analysis; later calls only refresh the values.
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; // host staging for the CSR pattern (pinned)
  PetscScalar                       *AAUp;        // values of the NON_TRANSPOSE ("upper") factor
  PetscScalar                       *AALo;        // values of the TRANSPOSE ("lower") factor
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  // NOTE(review): A->data is read both as Mat_SeqAIJ (a->nz above) and as Mat_SeqSBAIJ (i/j/a below);
  // presumably the ICC factor is laid out SBAIJ-style with the diagonal last in each row — confirm
  // against MatICCFactorSymbolic_SeqAIJ() before touching this.
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); // nothing to build for an empty matrix
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) { // first time: build pattern + values, descriptors, and analysis info
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] is the row's diagonal entry */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];         // negate off-diagonals for the upper factor
              AALo[j] = AAUp[j] / v[nz];  // additionally scale by the diagonal for the lower factor
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* fill mode stays UPPER: the lower-triangular solve is realized via the TRANSPOSE op below */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix (same pattern as the upper factor, different values) */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else { // pattern unchanged: recompute values only and refresh the device copies
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969 #endif
970 
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            perm = aij->row;
  PetscBool                     permIsIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif
  fs->nnz = (aij->nz - m) * 2 + m; // count both triangles, diagonal once

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* For a non-trivial ordering, cache the permutation and its inverse on the device */
  PetscCall(ISIdentity(perm, &permIsIdentity));
  if (!permIsIdentity) {
    IS              invperm;
    const PetscInt *inv_idx, *idx;

    PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &inv_idx));
    PetscCall(ISGetIndices(perm, &idx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(idx, idx + m);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(inv_idx, inv_idx + m);
    PetscCall(ISRestoreIndices(perm, &idx));
    PetscCall(ISRestoreIndices(invperm, &inv_idx));
    PetscCall(ISDestroy(&invperm));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* Numeric factorization runs on the host, so pull A down first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* pick the MatSolve variant matching the (possibly trivial) row ordering */
  Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
  PetscBool   identity;

  PetscCall(ISIdentity(bseq->row, &identity));
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build/refresh the device-side triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1045 
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054   cusparseIndexBase_t                indexBase;
1055   cusparseMatrixType_t               matrixType;
1056   cusparseFillMode_t                 fillMode;
1057   cusparseDiagType_t                 diagType;
1058 
1059   PetscFunctionBegin;
1060   /* allocate space for the transpose of the lower triangular factor */
1061   PetscCall(PetscNew(&loTriFactorT));
1062   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063 
1064   /* set the matrix descriptors of the lower triangular factor */
1065   matrixType = cusparseGetMatType(loTriFactor->descr);
1066   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069 
1070   /* Create the matrix description */
1071   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076 
1077   /* set the operation */
1078   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079 
1080   /* allocate GPU space for the CSC of the lower triangular factor*/
1081   loTriFactorT->csrMat                 = new CsrMatrix;
1082   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088 
1089   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095   #endif
1096 
1097   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098   {
1099     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104   #else
1105                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106   #endif
1107     PetscCallCUSPARSE(stat);
1108   }
1109 
1110   PetscCallCUDA(WaitForCUDA());
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112 
1113   /* Create the solve analysis information */
1114   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120   #endif
1121 
1122   /* perform the solve analysis */
1123   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125 
1126   PetscCallCUDA(WaitForCUDA());
1127   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128 
1129   /* assign the pointer */
1130   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131 
1132   /*********************************************/
1133   /* Now the Transpose of the Upper Tri Factor */
1134   /*********************************************/
1135 
1136   /* allocate space for the transpose of the upper triangular factor */
1137   PetscCall(PetscNew(&upTriFactorT));
1138   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139 
1140   /* set the matrix descriptors of the upper triangular factor */
1141   matrixType = cusparseGetMatType(upTriFactor->descr);
1142   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145 
1146   /* Create the matrix description */
1147   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152 
1153   /* set the operation */
1154   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155 
1156   /* allocate GPU space for the CSC of the upper triangular factor*/
1157   upTriFactorT->csrMat                 = new CsrMatrix;
1158   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164 
1165   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171   #endif
1172 
1173   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   {
1175     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180   #else
1181                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182   #endif
1183     PetscCallCUSPARSE(stat);
1184   }
1185 
1186   PetscCallCUDA(WaitForCUDA());
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188 
1189   /* Create the solve analysis information */
1190   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196   #endif
1197 
1198   /* perform the solve analysis */
1199   /* christ, would it have killed you to put this stuff in a function????????? */
1200   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202 
1203   PetscCallCUDA(WaitForCUDA());
1204   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205 
1206   /* assign the pointer */
1207   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211 
/* Functor used with thrust::transform to read back integer permutation indices
   that were pushed through cusparse csr2csc as scalar values: it truncates the
   real part of a PetscScalar to a PetscInt. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar val) { return (PetscInt)PetscRealPart(val); }
};
1215 
/* MatSeqAIJCUSPARSEFormExplicitTranspose - build (or refresh) an explicit transpose of A on the GPU,
   cached in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

   First call: allocates the transposed matrix structure and, for the CSR format, a permutation
   array csr2csc_i mapping A's nonzeros to transpose order. Later calls: returns immediately if
   A->transupdated is set; otherwise only the numerical values are refreshed by gathering through
   the cached csr2csc_i permutation (no cusparse conversion needed). */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  /* make sure the host values have been mirrored to the GPU before transposing */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose is current */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* the fast value-refresh path below only works for CSR; drop any stale transpose otherwise */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalars used by cusparse SpMV with the transpose */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transposed CSR: dimensions swapped, same number of nonzeros */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* mirror A's row offsets (host a->i) to the GPU; needed as csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* legacy (pre CUDA-11) path: go HYB -> CSR -> CSC -> HYB through two CSR temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the value-permutation csr2csc_i once: feed the identity sequence 0..nnz-1 (as scalars)
         through csr2csc, then truncate the permuted scalars back to integers. Afterwards a simple
         gather through csr2csc_i refreshes the transpose values without calling cusparse again. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* matrixT->values currently holds the permuted identity; convert it to integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* refresh the transpose values by gathering A's values through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408 
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* MatSolve_SeqAIJCUSPARSE_LU - solve A x = b using the cached cusparse SpSV LU factors.

   Pipeline: x_perm = P_r b (optional row permutation) -> solve L Y = x_perm ->
   solve U X = Y -> x = P_c^T X (optional column permutation). The intermediate
   buffers fs->X / fs->Y and all SpSV descriptors were set up at factorization time. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fact    = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *amat    = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     opNone  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                      nrows   = A->rmap->n;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  thrust::device_ptr<PetscScalar>       xdev = thrust::device_pointer_cast(xdata);
  thrust::device_ptr<const PetscScalar> bdev = thrust::device_pointer_cast(bdata);

  /* Apply the row permutation to b if present, staging the result in fact->X; otherwise point the descriptor at b directly */
  if (fact->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, fact->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, fact->rpermIndices->end()), thrust::device_pointer_cast(fact->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, (void *)bdata));
  }

  /* Lower triangular solve: L Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_Y, fact->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, opNone, &PETSC_CUSPARSE_ONE, fact->spMatDescr_L, fact->dnVecDescr_X, fact->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fact->spsvDescr_L));

  /* Upper triangular solve: U X = Y; write into fact->X when a column permutation must still be undone */
  if (fact->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, xdata));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, opNone, &PETSC_CUSPARSE_ONE, fact->spMatDescr_U, fact->dnVecDescr_Y, fact->dnVecDescr_X, cusparse_scalartype, spsvAlg, fact->spsvDescr_U));

  /* Undo the column permutation if present, scattering fact->X back into x */
  if (fact->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X), fact->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X + nrows), fact->cpermIndices->end()), xdev));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * amat->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461 
/* MatSolveTranspose_SeqAIJCUSPARSE_LU - solve A^T x = b with the cached LU factors, i.e.
   U^T Y = P_r b followed by L^T X = Y, reusing spMatDescr_L/U with CUSPARSE_OPERATION_TRANSPOSE.
   Transpose-specific SpSV descriptors/buffers (spsvDescr_Lt/Ut) are created lazily on the
   first call and the analysis is redone whenever the factors changed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (re)run the SpSV analysis for the transpose solves if the factors were updated since the last analysis */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1532 #else
1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* MatSolveTranspose_SeqAIJCUSPARSE - legacy (pre cusparseSpSV) transpose solve A^T x = b:
   x = P_c^T U^{-T} L^{-T} P_r b, using the transposed triangular factors generated on demand
   by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().

   Fix: the thrust::copy calls were bare, unlike every other thrust invocation in this file
   (e.g. the PetscCallThrust-wrapped copies in the SpSV solvers), so a thrust failure went
   unchecked; they are now wrapped in PetscCallThrust. Behavior is otherwise unchanged. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, writing the result into x */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve U^T: x -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L^T: tempGPU -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Lazily build the transposed triangular factors on first use */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Map b (read) and x (write) onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* First triangular solve with the transposed upper factor: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Second triangular solve with the transposed lower factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x, loT->solvePolicy, loT->solveBuffer));

  /* Hand the vectors back to PETSc */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1623 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *b;
  PetscScalar                          *x;
  thrust::device_ptr<const PetscScalar> bptr;
  thrust::device_ptr<PetscScalar>       xptr;
  Mat_SeqAIJCUSPARSETriFactors         *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                          *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map b (read) and x (write) onto the device and wrap them for thrust */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));
  xptr = thrust::device_pointer_cast(x);
  bptr = thrust::device_pointer_cast(b);

  PetscCall(PetscLogGpuTimeBegin());
  /* Gather b through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, factors->rpermIndices->end()), work->begin());

  /* Lower triangular solve: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, work->data().get(), x, lo->solvePolicy, lo->solveBuffer));

  /* Upper triangular solve: x -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, x, work->data().get(), up->solvePolicy, up->solveBuffer));

  /* Gather the work vector through the column permutation into the final solution x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()), xptr);

  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1663 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map b (read) and x (write) onto the device; no permutations are needed in natural ordering */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Lower triangular solve: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Upper triangular solve: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1693 #endif
1694 
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *gpuA = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *csrA;
  PetscInt                      m, nz;
  PetscBool                     iscusp;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &iscusp));
    PetscCheck(iscusp, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Make sure A's values are on the GPU, then copy them into fact's value array */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  csrA = (CsrMatrix *)gpuA->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, csrA->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ILU(0)-factorize fact in place on the device */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, so any transpose solve must redo its cusparseSpSV_analysis() */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1747 
/* Symbolic phase of ILU(0) with cuSPARSE: allocates fact's device CSR arrays (sharing A's nonzero
   pattern), creates the M/L/U descriptors, sizes and allocates the csrilu02/SpSV buffers, runs the
   structural analysis, and estimates the FLOPs of the numeric factorization.
   Fix: the FLOP estimate previously clobbered nzLeft (the count of nonzeros strictly left of the
   diagonal, nzLeft = Adiag[i] - Ai[i]) with the symmetric heuristic (nzRow - 1) / 2, which belongs
   to the ICC(0) path; the clobbering assignment is removed so the estimate matches the comment.
 */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) introduces no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L is unit lower triangular; U is non-unit upper triangular. Both share M's arrays (in-place factorization) */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1904 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;

  PetscFunctionBegin;
  /* Map b (read) and x (write) onto the device */
  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L*y = b, with X aliased to b and Y being the internal work vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bdev));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve Lt*x = y, reusing descriptor X now aliased to the output x */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xdev));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1935 
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *gpuA = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *csrA;
  PetscInt                      m, nz;
  PetscBool                     iscusp;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &iscusp));
    PetscCheck(iscusp, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Make sure A's values are on the GPU, then copy them into fact's value array */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  csrA = (CsrMatrix *)gpuA->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, csrA->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* IC(0)-factorize fact in place */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (needs valid values), so it must follow the factorization */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* IC is symmetric, so the same kernel serves MatSolve and MatSolveTranspose */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1988 
1989 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1990 {
1991   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1992   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1993   PetscInt                      m, nz;
1994 
1995   PetscFunctionBegin;
1996   if (PetscDefined(USE_DEBUG)) {
1997     PetscInt  i;
1998     PetscBool flg, missing;
1999 
2000     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2001     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2002     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2003     PetscCall(MatMissingDiagonal(A, &missing, &i));
2004     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2005   }
2006 
2007   /* Free the old stale stuff */
2008   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2009 
2010   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2011      but they will not be used. Allocate them just for easy debugging.
2012    */
2013   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2014 
2015   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2016   fact->factortype             = MAT_FACTOR_ICC;
2017   fact->info.factor_mallocs    = 0;
2018   fact->info.fill_ratio_given  = info->fill;
2019   fact->info.fill_ratio_needed = 1.0;
2020 
2021   aij->row = NULL;
2022   aij->col = NULL;
2023 
2024   /* ====================================================================== */
2025   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2026   /* We'll do in-place factorization on fact                                */
2027   /* ====================================================================== */
2028   const int *Ai, *Aj;
2029 
2030   m  = fact->rmap->n;
2031   nz = aij->nz;
2032 
2033   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
2034   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
2035   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2036   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2037   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2038   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2039 
2040   /* ====================================================================== */
2041   /* Create mat descriptors for M, L                                        */
2042   /* ====================================================================== */
2043   cusparseFillMode_t fillMode;
2044   cusparseDiagType_t diagType;
2045 
2046   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2047   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2048   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2049 
2050   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2051     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2052     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2053     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2054     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2055   */
2056   fillMode = CUSPARSE_FILL_MODE_LOWER;
2057   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2058   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2059   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2060   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2061 
2062   /* ========================================================================= */
2063   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2064   /* ========================================================================= */
2065   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2066   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2067 
2068   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2069   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2070 
2071   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2072   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2073 
2074   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2075   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2079 
2080   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2081      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2082    */
2083   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2084     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2085     fs->spsvBuffer_L = fs->factBuffer_M;
2086     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2087   } else {
2088     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2089     fs->spsvBuffer_Lt = fs->factBuffer_M;
2090     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2091   }
2092 
2093   /* ========================================================================== */
2094   /* Perform analysis of ic0 on M                                               */
2095   /* The lower triangular part of M has the same sparsity pattern as L          */
2096   /* ========================================================================== */
2097   int              structural_zero;
2098   cusparseStatus_t status;
2099 
2100   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2101   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2102   if (PetscDefined(USE_DEBUG)) {
2103     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2104     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2105     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2106   }
2107 
2108   /* Estimate FLOPs of the numeric factorization */
2109   {
2110     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2111     PetscInt      *Ai, nzRow, nzLeft;
2112     PetscLogDouble flops = 0.0;
2113 
2114     Ai = Aseq->i;
2115     for (PetscInt i = 0; i < m; i++) {
2116       nzRow = Ai[i + 1] - Ai[i];
2117       if (nzRow > 1) {
2118         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2119           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2120         */
2121         nzLeft = (nzRow - 1) / 2;
2122         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2123       }
2124     }
2125     fs->numericFactFlops = flops;
2126   }
2127   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 #endif
2131 
/* Numeric LU factorization for a MATSEQAIJCUSPARSE matrix.

   The factorization itself is performed on the host by MatLUFactorNumeric_SeqAIJ();
   this wrapper syncs A's values to the host first, then installs the device (or host)
   triangular-solve routines on the factored matrix B.

   B    - the factored matrix (holds Mat_SeqAIJCUSPARSETriFactors in spptr)
   A    - the matrix being factored
   info - factorization options
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* Host factorization reads a->a, so make sure the host copy of A's values is current */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factors currently live only on the host */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    /* identity row/column permutations allow the faster natural-ordering solve path */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL; /* no multi-RHS solve support here */
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2170 
/* Symbolic LU factorization for a MATSEQAIJCUSPARSE matrix.

   The symbolic analysis is delegated to the host SeqAIJ implementation; this
   wrapper first discards any stale device-side triangular-factor data attached
   to B, then points the numeric-factorization hook at the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Any previously built device factors are invalid once the pattern changes */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2181 
/* Symbolic ILU factorization for a MATSEQAIJCUSPARSE matrix.

   With CUDA >= 11.4, ILU(0) with identity row/column permutations is routed to a
   fully device-side path (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0); otherwise the
   host SeqAIJ symbolic phase is used and the CUSPARSE numeric routine is installed. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* The device fast path requires: factorization bound to the device, zero fill
     levels, and identity orderings */
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Fallback: host symbolic ILU(k); numeric phase still uses the CUSPARSE wrapper */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2204 
/* Symbolic ICC factorization for a MATSEQAIJCUSPARSE matrix.

   With CUDA >= 11.4, ICC(0) with an identity permutation is routed to a
   device-side path (MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0); otherwise the
   host SeqAIJ symbolic phase is used and the CUSPARSE numeric routine installed. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* Device fast path: factorize on device, zero fill levels, identity ordering */
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* Fallback: host symbolic ICC(k) */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2224 
/* Symbolic Cholesky factorization for a MATSEQAIJCUSPARSE matrix: discard any
   device-side factor data, run the host SeqAIJ symbolic phase, and install the
   CUSPARSE numeric-factorization routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); /* stale device factors would be invalid */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2235 
/* Query function composed on factored matrices: reports that this factorization
   package is MATSOLVERCUSPARSE ("cusparse"). The Mat argument is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2242 
2243 /*MC
2244   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2245   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2248   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2249   algorithms are not recommended. This class does NOT support direct solver operations.
2250 
2251   Level: beginner
2252 
2253 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2254           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2255 M*/
2256 
/* Factory routine registered for MATSOLVERCUSPARSE: creates the (empty) factor
   matrix B for A and wires up the symbolic-factorization function pointers.

   A     - matrix to be factored (MATSEQAIJCUSPARSE)
   ftype - requested factorization (LU/ILU/ILUDT or Cholesky/ICC)
   B     - output: the new factor matrix

   Honors -mat_factor_bind_factorization {host|device} (default "device") to decide
   where the numeric factorization will be performed. If A is bound to the CPU, the
   plain SeqAIJ symbolic routines are installed instead of the CUSPARSE ones. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Parse -mat_factor_bind_factorization using the factor's (or A's) options prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* matrix is CPU-bound: fall back to the host SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* Factor matrices get their storage during the symbolic phase, not here */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2309 
/* Copy the matrix values (not the pattern) from the device back to the host CSR
   arrays, if the device holds the only up-to-date copy.

   For unfactored matrices the values come from the cusparse CSR structure; for
   factored matrices (CUDA >= 11.4) they come from the triangular-factor storage
   fs->csrVal. On success the offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* For factored matrices spptr holds a Mat_SeqAIJCUSPARSETriFactors instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2339 
/* MatSeqAIJGetArray() implementation: sync values to the host if the device copy
   is newer, then return the host CSR value array for read/write access. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host copy may be stale */
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2347 
/* MatSeqAIJRestoreArray() implementation: the caller may have modified the host
   values, so the host becomes the authoritative copy and the pointer is cleared. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now considered stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2355 
/* MatSeqAIJGetArrayRead() implementation: sync values to the host if needed and
   return the host CSR value array for read-only access (offload mask unchanged). */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host copy may be stale */
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2363 
/* MatSeqAIJRestoreArrayRead() implementation: read-only access does not
   invalidate the device copy, so only the pointer is cleared. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2370 
/* MatSeqAIJGetArrayWrite() implementation: return the host CSR value array for
   write-only access. Current values need not be synced from the device since the
   caller is expected to overwrite them. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2377 
/* MatSeqAIJRestoreArrayWrite() implementation: the host values were (re)written,
   so mark the host copy authoritative and clear the pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now considered stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2385 
/* Return device pointers to the CSR representation (row offsets i, column
   indices j, values a) of an unfactored MATSEQAIJCUSPARSE matrix, and report
   that they live in CUDA memory. Any of i/j/a/mtype may be NULL if not wanted.

   Note the device row offsets / column indices are stored as 32-bit ints
   (THRUSTINTARRAY32), so this cannot be exposed as PetscInt* when PETSc is
   configured with 64-bit indices. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* fixed error message grammar: "does not supported" -> "does not support" */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2416 
/* Copy a SeqAIJ matrix to the GPU, (re)building the cusparse data structures.

   If the nonzero pattern is unchanged since the last copy and the format is CSR,
   only the values are re-uploaded. Otherwise all device structures are destroyed
   and rebuilt from the host CSR (using the compressed-row form if active).
   On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host has no
   value array yet (pattern-only upload), in which case it is left as-is. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set false when only the pattern (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose values are stale (pattern kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* Pattern changed (or non-CSR format): tear down and rebuild everything */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Use the compressed-row (nonzero rows only) view when available */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload pattern only and keep offload mask unchanged */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: build a temporary CSR, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2566 
/* Accumulation functor for zipped iterator pairs: second tuple element += first.
   Usable from both host and device code paths. */
struct VecCUDAPlusEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2574 
/* Assignment functor for zipped iterator pairs: second tuple element = first.
   Usable from both host and device code paths. */
struct VecCUDAEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2582 
/* Assignment functor with the tuple roles swapped: first tuple element = second.
   Usable from both host and device code paths. */
struct VecCUDAEqualsReverse {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2590 
/* Per-product workspace attached to C->product->data for sparse mat-mat products.
   Freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* is the product matrix C dense? -- TODO(review) confirm against symbolic phase */
  PetscScalar   *Bt;       /* device buffer freed with cudaFree(); presumably an explicit transpose of B -- confirm in the ABt path */
  Mat            X;        /* intermediate dense product, used for MATPRODUCT_PtAP/RARt in the numeric phase */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count accumulated for logging */
  CsrMatrix     *Bcsr;     /* CSR copy of B owned by this struct (deleted in the destructor) */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B (created with leading dimension Blda) */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C or X (created with leading dimension Clda) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra device work buffers, freed with cudaFree() */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* main SpMM/SpGEMM device work buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* cusparse SpGEMM descriptor */
#endif
};
2615 
/* Destructor for the MatMatCusparse workspace stored in C->product->data:
   releases device buffers, cusparse descriptors, the CSR copy of B, and the
   intermediate dense matrix X, then frees the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *d = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(d->Bt)); /* cudaFree(NULL) is a no-op */
  delete d->Bcsr;                 /* likewise safe when NULL */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (d->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(d->matSpBDescr));
  if (d->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(d->matBDescr));
  if (d->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(d->matCDescr));
  if (d->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(d->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (d->dBuffer4) PetscCallCUDA(cudaFree(d->dBuffer4));
  if (d->dBuffer5) PetscCallCUDA(cudaFree(d->dBuffer5));
  #endif
  if (d->mmBuffer) PetscCallCUDA(cudaFree(d->mmBuffer));
  if (d->mmBuffer2) PetscCallCUDA(cudaFree(d->mmBuffer2));
#endif
  PetscCall(MatDestroy(&d->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2639 
2640 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2641 
2642 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643 {
2644   Mat_Product                  *product = C->product;
2645   Mat                           A, B;
2646   PetscInt                      m, n, blda, clda;
2647   PetscBool                     flg, biscuda;
2648   Mat_SeqAIJCUSPARSE           *cusp;
2649   cusparseStatus_t              stat;
2650   cusparseOperation_t           opA;
2651   const PetscScalar            *barray;
2652   PetscScalar                  *carray;
2653   MatMatCusparse               *mmdata;
2654   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655   CsrMatrix                    *csrmat;
2656 
2657   PetscFunctionBegin;
2658   MatCheckProduct(C, 1);
2659   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660   mmdata = (MatMatCusparse *)product->data;
2661   A      = product->A;
2662   B      = product->B;
2663   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666      Instead of silently accepting the wrong answer, I prefer to raise the error */
2667   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670   switch (product->type) {
2671   case MATPRODUCT_AB:
2672   case MATPRODUCT_PtAP:
2673     mat = cusp->mat;
2674     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675     m   = A->rmap->n;
2676     n   = B->cmap->n;
2677     break;
2678   case MATPRODUCT_AtB:
2679     if (!A->form_explicit_transpose) {
2680       mat = cusp->mat;
2681       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682     } else {
2683       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684       mat = cusp->matTranspose;
2685       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686     }
2687     m = A->cmap->n;
2688     n = B->cmap->n;
2689     break;
2690   case MATPRODUCT_ABt:
2691   case MATPRODUCT_RARt:
2692     mat = cusp->mat;
2693     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694     m   = A->rmap->n;
2695     n   = B->rmap->n;
2696     break;
2697   default:
2698     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699   }
2700   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701   csrmat = (CsrMatrix *)mat->mat;
2702   /* if the user passed a CPU matrix, copy the data to the GPU */
2703   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2706 
2707   PetscCall(MatDenseGetLDA(B, &blda));
2708   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711   } else {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(C, &clda));
2714   }
2715 
2716   PetscCall(PetscLogGpuTimeBegin());
2717 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2720   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2721     size_t mmBufferSize;
2722     if (mmdata->initialized && mmdata->Blda != blda) {
2723       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2724       mmdata->matBDescr = NULL;
2725     }
2726     if (!mmdata->matBDescr) {
2727       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2728       mmdata->Blda = blda;
2729     }
2730 
2731     if (mmdata->initialized && mmdata->Clda != clda) {
2732       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2733       mmdata->matCDescr = NULL;
2734     }
2735     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2736       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737       mmdata->Clda = clda;
2738     }
2739 
2740     if (!mat->matDescr) {
2741       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2742                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2743       PetscCallCUSPARSE(stat);
2744     }
2745     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2746     PetscCallCUSPARSE(stat);
2747     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2748       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2749       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2750       mmdata->mmBufferSize = mmBufferSize;
2751     }
2752     mmdata->initialized = PETSC_TRUE;
2753   } else {
2754     /* to be safe, always update pointers of the mats */
2755     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2756     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2757     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2758   }
2759 
2760   /* do cusparseSpMM, which supports transpose on B */
2761   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2762   PetscCallCUSPARSE(stat);
2763 #else
2764   PetscInt k;
2765   /* cusparseXcsrmm does not support transpose on B */
2766   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2767     cublasHandle_t cublasv2handle;
2768     cublasStatus_t cerr;
2769 
2770     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2771     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2772     PetscCallCUBLAS(cerr);
2773     blda = B->cmap->n;
2774     k    = B->cmap->n;
2775   } else {
2776     k = B->rmap->n;
2777   }
2778 
2779   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2780   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2781   PetscCallCUSPARSE(stat);
2782 #endif
2783   PetscCall(PetscLogGpuTimeEnd());
2784   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2785   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2786   if (product->type == MATPRODUCT_RARt) {
2787     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2788     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2789   } else if (product->type == MATPRODUCT_PtAP) {
2790     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2791     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2792   } else {
2793     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2794   }
2795   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2796   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2797   PetscFunctionReturn(PETSC_SUCCESS);
2798 }
2799 
/* Symbolic phase for products of a MATSEQAIJCUSPARSE matrix A with a dense matrix B.
   Determines the dimensions of the result from the product type, converts C to
   MATSEQDENSECUDA, and allocates the MatMatCusparse product data (including any
   intermediate storage) consumed by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nr, nc;
  PetscBool           cisdense, match;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on which operand (if any) is implicitly transposed */
  if (product->type == MATPRODUCT_AB) {
    nr = A->rmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    nr = A->cmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    nr = A->rmap->n;
    nc = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    nr = B->cmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    nr = B->rmap->n;
    nc = B->rmap->n;
  } else {
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, nr, nc, nr, nc));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* RARt and PtAP need a dense intermediate X holding A times op(B) */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2870 
/* Numeric phase for C = A*B, A^T*B or A*B^T with A, B and C all of type MATSEQAIJCUSPARSE.
   Runs the cuSPARSE SpGEMM using the descriptors and buffers created by the symbolic phase
   (stored in the MatMatCusparse product data), then finishes CPU-side assembly bookkeeping. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values are already on the GPU; only the CPU-side assembly bookkeeping remains */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  /* MatSeqAIJCUSPARSECopyToGPU does not copy matrices bound to the CPU, so raise an error
     rather than silently computing with stale GPU data */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* mirror the symmetry-based product rewrites done at symbolic time, so the operand
     structs selected below match what the symbolic phase prepared */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* AtB/ABt use the explicit transposes (matTranspose) since SpGEMM has no transpose support */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* CUDA >= 11.4: the SpGEMMreuse descriptor built at symbolic time lets us recompute values only;
     older CUDA 11 must redo SpGEMM_compute and copy the result into C */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 legacy csrgemm path */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flops count was precomputed at symbolic time */
  PetscCallCUDA(WaitForCUDA());               /* synchronize before stopping the GPU timer */
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2991 
2992 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2993 {
2994   Mat_Product                  *product = C->product;
2995   Mat                           A, B;
2996   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2997   Mat_SeqAIJ                   *a, *b, *c;
2998   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2999   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3000   PetscInt                      i, j, m, n, k;
3001   PetscBool                     flg;
3002   cusparseStatus_t              stat;
3003   MatProductType                ptype;
3004   MatMatCusparse               *mmdata;
3005   PetscLogDouble                flops;
3006   PetscBool                     biscompressed, ciscompressed;
3007 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3008   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3009   cusparseSpMatDescr_t BmatSpDescr;
3010 #else
3011   int cnz;
3012 #endif
3013   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3014 
3015   PetscFunctionBegin;
3016   MatCheckProduct(C, 1);
3017   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3018   A = product->A;
3019   B = product->B;
3020   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3021   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3022   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3023   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3024   a = (Mat_SeqAIJ *)A->data;
3025   b = (Mat_SeqAIJ *)B->data;
3026   /* product data */
3027   PetscCall(PetscNew(&mmdata));
3028   C->product->data    = mmdata;
3029   C->product->destroy = MatDestroy_MatMatCusparse;
3030 
3031   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3032   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3033   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3034   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3035   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3037 
3038   ptype = product->type;
3039   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3040     ptype                                          = MATPRODUCT_AB;
3041     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3042   }
3043   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3044     ptype                                          = MATPRODUCT_AB;
3045     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3046   }
3047   biscompressed = PETSC_FALSE;
3048   ciscompressed = PETSC_FALSE;
3049   switch (ptype) {
3050   case MATPRODUCT_AB:
3051     m    = A->rmap->n;
3052     n    = B->cmap->n;
3053     k    = A->cmap->n;
3054     Amat = Acusp->mat;
3055     Bmat = Bcusp->mat;
3056     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3057     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3058     break;
3059   case MATPRODUCT_AtB:
3060     m = A->cmap->n;
3061     n = B->cmap->n;
3062     k = A->rmap->n;
3063     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3064     Amat = Acusp->matTranspose;
3065     Bmat = Bcusp->mat;
3066     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3067     break;
3068   case MATPRODUCT_ABt:
3069     m = A->rmap->n;
3070     n = B->rmap->n;
3071     k = A->cmap->n;
3072     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3073     Amat = Acusp->mat;
3074     Bmat = Bcusp->matTranspose;
3075     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3076     break;
3077   default:
3078     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3079   }
3080 
3081   /* create cusparse matrix */
3082   PetscCall(MatSetSizes(C, m, n, m, n));
3083   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3084   c     = (Mat_SeqAIJ *)C->data;
3085   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3086   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3087   Ccsr  = new CsrMatrix;
3088 
3089   c->compressedrow.use = ciscompressed;
3090   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3091     c->compressedrow.nrows = a->compressedrow.nrows;
3092     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3093     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3094     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3095     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3096     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3097   } else {
3098     c->compressedrow.nrows  = 0;
3099     c->compressedrow.i      = NULL;
3100     c->compressedrow.rindex = NULL;
3101     Ccusp->workVector       = NULL;
3102     Cmat->cprowIndices      = NULL;
3103   }
3104   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3105   Ccusp->mat        = Cmat;
3106   Ccusp->mat->mat   = Ccsr;
3107   Ccsr->num_rows    = Ccusp->nrows;
3108   Ccsr->num_cols    = n;
3109   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3110   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3111   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3112   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3113   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
3114   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
3115   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3116   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3117   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3119   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3120     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3121     c->nz                = 0;
3122     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3123     Ccsr->values         = new THRUSTARRAY(c->nz);
3124     goto finalizesym;
3125   }
3126 
3127   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3128   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3129   Acsr = (CsrMatrix *)Amat->mat;
3130   if (!biscompressed) {
3131     Bcsr = (CsrMatrix *)Bmat->mat;
3132 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3133     BmatSpDescr = Bmat->matDescr;
3134 #endif
3135   } else { /* we need to use row offsets for the full matrix */
3136     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3137     Bcsr                 = new CsrMatrix;
3138     Bcsr->num_rows       = B->rmap->n;
3139     Bcsr->num_cols       = cBcsr->num_cols;
3140     Bcsr->num_entries    = cBcsr->num_entries;
3141     Bcsr->column_indices = cBcsr->column_indices;
3142     Bcsr->values         = cBcsr->values;
3143     if (!Bcusp->rowoffsets_gpu) {
3144       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3145       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3146       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3147     }
3148     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3149     mmdata->Bcsr      = Bcsr;
3150 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3151     if (Bcsr->num_rows && Bcsr->num_cols) {
3152       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3153       PetscCallCUSPARSE(stat);
3154     }
3155     BmatSpDescr = mmdata->matSpBDescr;
3156 #endif
3157   }
3158   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3159   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3160   /* precompute flops count */
3161   if (ptype == MATPRODUCT_AB) {
3162     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3163       const PetscInt st = a->i[i];
3164       const PetscInt en = a->i[i + 1];
3165       for (j = st; j < en; j++) {
3166         const PetscInt brow = a->j[j];
3167         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3168       }
3169     }
3170   } else if (ptype == MATPRODUCT_AtB) {
3171     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3172       const PetscInt anzi = a->i[i + 1] - a->i[i];
3173       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3174       flops += (2. * anzi) * bnzi;
3175     }
3176   } else { /* TODO */
3177     flops = 0.;
3178   }
3179 
3180   mmdata->flops = flops;
3181   PetscCall(PetscLogGpuTimeBegin());
3182 
3183 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3184   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3185   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3186   PetscCallCUSPARSE(stat);
3187   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3188   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3189   {
3190     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3191      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3192   */
3193     void *dBuffer1 = NULL;
3194     void *dBuffer2 = NULL;
3195     void *dBuffer3 = NULL;
3196     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3197     size_t bufferSize1 = 0;
3198     size_t bufferSize2 = 0;
3199     size_t bufferSize3 = 0;
3200     size_t bufferSize4 = 0;
3201     size_t bufferSize5 = 0;
3202 
3203     /* ask bufferSize1 bytes for external memory */
3204     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3205     PetscCallCUSPARSE(stat);
3206     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3207     /* inspect the matrices A and B to understand the memory requirement for the next step */
3208     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3209     PetscCallCUSPARSE(stat);
3210 
3211     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3212     PetscCallCUSPARSE(stat);
3213     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3214     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3215     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3216     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3217     PetscCallCUSPARSE(stat);
3218     PetscCallCUDA(cudaFree(dBuffer1));
3219     PetscCallCUDA(cudaFree(dBuffer2));
3220 
3221     /* get matrix C non-zero entries C_nnz1 */
3222     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3223     c->nz = (PetscInt)C_nnz1;
3224     /* allocate matrix C */
3225     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3226     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3227     Ccsr->values = new THRUSTARRAY(c->nz);
3228     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3229     /* update matC with the new pointers */
3230     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3231     PetscCallCUSPARSE(stat);
3232 
3233     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3234     PetscCallCUSPARSE(stat);
3235     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3236     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3237     PetscCallCUSPARSE(stat);
3238     PetscCallCUDA(cudaFree(dBuffer3));
3239     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3240     PetscCallCUSPARSE(stat);
3241     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3242   }
3243   #else
3244   size_t bufSize2;
3245   /* ask bufferSize bytes for external memory */
3246   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3247   PetscCallCUSPARSE(stat);
3248   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3249   /* inspect the matrices A and B to understand the memory requirement for the next step */
3250   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3251   PetscCallCUSPARSE(stat);
3252   /* ask bufferSize again bytes for external memory */
3253   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3254   PetscCallCUSPARSE(stat);
3255   /* The CUSPARSE documentation is not clear, nor the API
3256      We need both buffers to perform the operations properly!
3257      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3258      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3259      is stored in the descriptor! What a messy API... */
3260   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3261   /* compute the intermediate product of A * B */
3262   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3263   PetscCallCUSPARSE(stat);
3264   /* get matrix C non-zero entries C_nnz1 */
3265   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3266   c->nz = (PetscInt)C_nnz1;
3267   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3268                       mmdata->mmBufferSize / 1024));
3269   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3270   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3271   Ccsr->values = new THRUSTARRAY(c->nz);
3272   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3273   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3274   PetscCallCUSPARSE(stat);
3275   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3276   PetscCallCUSPARSE(stat);
3277   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3278 #else
3279   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3280   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3281                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3282   PetscCallCUSPARSE(stat);
3283   c->nz                = cnz;
3284   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3285   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3286   Ccsr->values = new THRUSTARRAY(c->nz);
3287   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3288 
3289   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3290   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3291      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3292      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3293   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3294                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3295   PetscCallCUSPARSE(stat);
3296 #endif
3297   PetscCall(PetscLogGpuFlops(mmdata->flops));
3298   PetscCall(PetscLogGpuTimeEnd());
3299 finalizesym:
3300   c->singlemalloc = PETSC_FALSE;
3301   c->free_a       = PETSC_TRUE;
3302   c->free_ij      = PETSC_TRUE;
3303   PetscCall(PetscMalloc1(m + 1, &c->i));
3304   PetscCall(PetscMalloc1(c->nz, &c->j));
3305   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3306     PetscInt      *d_i = c->i;
3307     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3308     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3309     ii = *Ccsr->row_offsets;
3310     jj = *Ccsr->column_indices;
3311     if (ciscompressed) d_i = c->compressedrow.i;
3312     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3313     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3314   } else {
3315     PetscInt *d_i = c->i;
3316     if (ciscompressed) d_i = c->compressedrow.i;
3317     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3318     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3319   }
3320   if (ciscompressed) { /* need to expand host row offsets */
3321     PetscInt r = 0;
3322     c->i[0]    = 0;
3323     for (k = 0; k < c->compressedrow.nrows; k++) {
3324       const PetscInt next = c->compressedrow.rindex[k];
3325       const PetscInt old  = c->compressedrow.i[k];
3326       for (; r < next; r++) c->i[r + 1] = old;
3327     }
3328     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3329   }
3330   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3331   PetscCall(PetscMalloc1(m, &c->ilen));
3332   PetscCall(PetscMalloc1(m, &c->imax));
3333   c->maxnz         = c->nz;
3334   c->nonzerorowcnt = 0;
3335   c->rmax          = 0;
3336   for (k = 0; k < m; k++) {
3337     const PetscInt nn = c->i[k + 1] - c->i[k];
3338     c->ilen[k] = c->imax[k] = nn;
3339     c->nonzerorowcnt += (PetscInt) !!nn;
3340     c->rmax = PetscMax(c->rmax, nn);
3341   }
3342   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3343   PetscCall(PetscMalloc1(c->nz, &c->a));
3344   Ccsr->num_entries = c->nz;
3345 
3346   C->nonzerostate++;
3347   PetscCall(PetscLayoutSetUp(C->rmap));
3348   PetscCall(PetscLayoutSetUp(C->cmap));
3349   Ccusp->nonzerostate = C->nonzerostate;
3350   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3351   C->preallocated     = PETSC_TRUE;
3352   C->assembled        = PETSC_FALSE;
3353   C->was_assembled    = PETSC_FALSE;
3354   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3355     mmdata->reusesym = PETSC_TRUE;
3356     C->offloadmask   = PETSC_OFFLOAD_GPU;
3357   }
3358   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3359   PetscFunctionReturn(PETSC_SUCCESS);
3360 }
3361 
3362 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3363 
3364 /* handles sparse or dense B */
3365 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3366 {
3367   Mat_Product *product = mat->product;
3368   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3369 
3370   PetscFunctionBegin;
3371   MatCheckProduct(mat, 1);
3372   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3373   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3374   if (product->type == MATPRODUCT_ABC) {
3375     Ciscusp = PETSC_FALSE;
3376     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3377   }
3378   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3379     PetscBool usecpu = PETSC_FALSE;
3380     switch (product->type) {
3381     case MATPRODUCT_AB:
3382       if (product->api_user) {
3383         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3384         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3385         PetscOptionsEnd();
3386       } else {
3387         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3388         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3389         PetscOptionsEnd();
3390       }
3391       break;
3392     case MATPRODUCT_AtB:
3393       if (product->api_user) {
3394         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3395         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3396         PetscOptionsEnd();
3397       } else {
3398         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3399         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3400         PetscOptionsEnd();
3401       }
3402       break;
3403     case MATPRODUCT_PtAP:
3404       if (product->api_user) {
3405         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3406         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3407         PetscOptionsEnd();
3408       } else {
3409         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3410         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3411         PetscOptionsEnd();
3412       }
3413       break;
3414     case MATPRODUCT_RARt:
3415       if (product->api_user) {
3416         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3417         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3418         PetscOptionsEnd();
3419       } else {
3420         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3421         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3422         PetscOptionsEnd();
3423       }
3424       break;
3425     case MATPRODUCT_ABC:
3426       if (product->api_user) {
3427         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3428         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3429         PetscOptionsEnd();
3430       } else {
3431         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3432         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3433         PetscOptionsEnd();
3434       }
3435       break;
3436     default:
3437       break;
3438     }
3439     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3440   }
3441   /* dispatch */
3442   if (isdense) {
3443     switch (product->type) {
3444     case MATPRODUCT_AB:
3445     case MATPRODUCT_AtB:
3446     case MATPRODUCT_ABt:
3447     case MATPRODUCT_PtAP:
3448     case MATPRODUCT_RARt:
3449       if (product->A->boundtocpu) {
3450         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3451       } else {
3452         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3453       }
3454       break;
3455     case MATPRODUCT_ABC:
3456       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3457       break;
3458     default:
3459       break;
3460     }
3461   } else if (Biscusp && Ciscusp) {
3462     switch (product->type) {
3463     case MATPRODUCT_AB:
3464     case MATPRODUCT_AtB:
3465     case MATPRODUCT_ABt:
3466       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3467       break;
3468     case MATPRODUCT_PtAP:
3469     case MATPRODUCT_RARt:
3470     case MATPRODUCT_ABC:
3471       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3472       break;
3473     default:
3474       break;
3475     }
3476   } else { /* fallback for AIJ */
3477     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3478   }
3479   PetscFunctionReturn(PETSC_SUCCESS);
3480 }
3481 
/* yy = A xx: no addition vector (NULL), no transpose, no Hermitian */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3488 
/* zz = A xx + yy: no transpose, no Hermitian */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3495 
/* yy = A^H xx: transpose with conjugation, no addition vector */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3502 
/* zz = A^H xx + yy: transpose with conjugation */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3509 
/* yy = A^T xx: transpose without conjugation, no addition vector */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3516 
/* ScatterAdd: y[idx[i]] += x[i] for 0 <= i < n.
   Used to add the compressed-row work vector back into the full-length result vector.
   Uses a grid-stride loop so the kernel is correct for any launch configuration and so the
   loop index cannot overflow: the previous 32-bit `int i` computed from
   blockIdx.x * blockDim.x + threadIdx.x overflows when PetscInt is 64-bit and n > INT_MAX.
   NOTE(review): assumes idx[] entries are distinct (true for compressed-row index lists),
   otherwise the unguarded += would race and atomics would be required. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x * gridDim.x;
  for (PetscInt i = (PetscInt)blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3522 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add).
   yy may be NULL (no addition) or alias zz (in-place add). Handles compressed-row storage, where
   only the nonzero rows of A are stored and a work vector bridges the short/full-length vectors. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x, taken from the CSR matrix actually used */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* empty matrix: op(A) x is zero, so the result is yy (if given) or the zero vector */
  if (!a->nz) {
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Hermitian products, or matrices not forming an explicit transpose, multiply with A and a
       (conjugate-)transpose op; otherwise use the stored explicit transpose with a non-transpose op */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* gather: workVector[k] = x[cprowIndices[k]] (VecCUDAEqualsReverse assigns right-to-left) */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero for multiply-add; without yy, one add per nonzero row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3693 
/* zz = A^T xx + yy: transpose without conjugation */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3700 
/* Host-side assembly only; operations that need device data copy it over lazily
   (e.g. via MatSeqAIJCUSPARSECopyToGPU() in the mult kernels) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3707 
3708 /*@
3709    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3710    (the default parallel PETSc format). This matrix will ultimately pushed down
3711    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3712    assembly performance the user should preallocate the matrix storage by setting
3713    the parameter `nz` (or the array `nnz`).
3714 
3715    Collective
3716 
3717    Input Parameters:
3718 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3719 .  m - number of rows
3720 .  n - number of columns
3721 .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
3722 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3723 
3724    Output Parameter:
3725 .  A - the matrix
3726 
3727    Level: intermediate
3728 
3729    Notes:
3730    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3731    MatXXXXSetPreallocation() paradgm instead of this routine directly.
3732    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3733 
3734    The AIJ format, also called
3735    compressed row storage, is fully compatible with standard Fortran
3736    storage.  That is, the stored row and column indices can begin at
3737    either one (as in Fortran) or zero.
3738 
3739    Specify the preallocated storage with either nz or nnz (not both).
3740    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3741    allocation.
3742 
3743 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3744 @*/
3745 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3746 {
3747   PetscFunctionBegin;
3748   PetscCall(MatCreate(comm, A));
3749   PetscCall(MatSetSizes(*A, m, n, m, n));
3750   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3751   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3752   PetscFunctionReturn(PETSC_SUCCESS);
3753 }
3754 
/* Destroys a SEQAIJCUSPARSE matrix: releases the GPU-side data hanging off spptr,
   detaches every composed method this implementation installed, then runs the base
   SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* all method names composed on this object by the CUSPARSE implementation */
  const char *const composed_names[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                        "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* factored matrices keep triangular-factor data in spptr; unfactored ones keep the CUSPARSE struct */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  /* composing NULL removes each method from the object */
  for (size_t i = 0; i < sizeof(composed_names) / sizeof(composed_names[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed_names[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3776 
3777 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3778 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the copy in place back to the CUSPARSE type */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3786 
/* Y = Y + a*X on the GPU.

   Strategy:
   - if only one of X,Y is bound to the GPU (detected via differing axpy function pointers),
     fall back to the host kernel;
   - if the nonzero patterns turn out to be identical, the value arrays are conformal and a
     single cublasXaxpy() on them suffices;
   - for SUBSET_NONZERO_PATTERN, use cusparse's csr spgeam (Y = 1*Y + a*X);
   - otherwise fall back to the host kernel (which may change Y's pattern).
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* matrices differ in CPU/GPU binding: do the work on the host and invalidate Y's cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in Y = b*Y + a*X */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* a and b live on the host, so switch pointer mode for the spgeam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 uses the two-phase csrgeam2 API: query workspace size, then compute */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    /* output aliases Y's arrays: valid because X's pattern is a subset of Y's */
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore PETSc's default pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* Y's values changed; cached diagonal is stale */
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays are conformal, use a plain axpy on them */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: host fallback, which may reallocate Y's structure */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3872 
/* Y = a*Y: scale every stored nonzero of Y on the GPU with one cuBLAS scal call,
   then invalidate the cached diagonal (values changed, pattern did not). */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals)); /* device pointer to the nonzero values */
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* guard against PetscInt -> BLAS int overflow */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3892 
/* Set all stored values of A to zero, on the device (CSR copy and cached transpose,
   when present) and on the host, and update the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ *)A->data;
  PetscBool   ongpu = PETSC_FALSE; /* set if the device copy was zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      /* keep the cached transpose consistent instead of invalidating it */
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n])); /* host copy */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3919 
/* Bind (flg = PETSC_TRUE) or unbind (flg = PETSC_FALSE) matrix A to the CPU by swapping
   its operation tables between the SeqAIJ host kernels and the SeqAIJCUSPARSE device
   kernels, and composing/removing the CUSPARSE-specific query functions accordingly. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices have their own ops tables; only record the binding */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* binding to CPU: make sure host data is current before host kernels take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ sub-ops (array accessors, etc.) so defaults apply again */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* remove CUSPARSE-specific composed functions */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* unbinding: install the GPU kernels */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-only optimization: enable them only when bound to CPU and present */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3983 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE.

   reuse semantics: MAT_INITIAL_MATRIX duplicates A into *newmat; MAT_REUSE_MATRIX copies
   A's values into the existing *newmat; MAT_INPLACE_MATRIX converts A itself (then
   *newmat == A on entry). Allocates the CUSPARSE side-structure (spptr) on first
   conversion, installs the CUSPARSE ops table, and composes type-specific functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix should live on the GPU too */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: full CUSPARSE struct with handle, default format and algorithms */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: lighter struct holding the triangular-factor data */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* no device copy exists yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU kernels (also composes the COO/product functions) */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4043 
/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix and convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4051 
4052 /*MC
4053    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4054 
4055    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4056    CSR, ELL, or Hybrid format.
4057    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4058 
4059    Options Database Keys:
4060 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4061 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4062                                       Other options include ell (ellpack) or hyb (hybrid).
4063 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4064 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4065 
4066   Level: beginner
4067 
4068 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4069 M*/
4070 
/* Register the CUSPARSE solver package for every factorization it supports on
   MATSEQAIJCUSPARSE matrices: LU, Cholesky, ILU, and ICC. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* all factor types share the same factory routine */
  for (size_t i = 0; i < sizeof(ftypes) / sizeof(ftypes[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4081 
/* Release all COO-assembly state held in the matrix's CUSPARSE struct so the matrix
   can be re-preallocated (or destroyed) cleanly.

   Bug fix: jmap_d/perm_d must be set to NULL after cudaFree(), otherwise
   MatSeqAIJCUSPARSE_Destroy() sees non-NULL pointers and frees them a second time. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  /* thrust arrays from the basic COO path; delete of NULL is a no-op */
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    /* device arrays from the extended COO path */
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* avoid double cudaFree() in MatSeqAIJCUSPARSE_Destroy() */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4099 
/* Destroy the full Mat_SeqAIJCUSPARSE side-structure: both mult structs (matrix and
   its cached transpose), all scratch thrust arrays, COO state, the cusparse handle,
   and finally the struct itself. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    /* delete of NULL is a no-op, so these are unconditional */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device arrays; NULL when unused (or already freed and NULLed by reset) */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4118 
/* Free a CsrMatrix (its three device arrays plus the struct) and NULL the caller's
   pointer; a NULL input is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = nullptr;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4131 
4132 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Destroy one triangular-factor struct (legacy csrsv path, CUDA < 11.4 only):
   descriptor, solve info, CSR storage, and all scratch buffers. Safe when
   *trifactor == NULL; individual members are freed only when set. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4149 #endif
4150 
/* Destroy a mult struct: the stored matrix (CSR, or HYB on pre-CUDA-11 builds), its
   descriptor, the device-resident scalar constants, and (CUDA >= 11) the generic-API
   SpMat descriptor plus any initialized SpMV buffers. NULLs the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        /* the hyb format (and its destroy routine) was removed from cusparse in CUDA 11 */
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device copies of the scalars 1 and 0 used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* generic-API state: SpMat descriptor and the per-operation (N/T/H) SpMV caches */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4192 
/* Release all factorization state held in the tri-factors struct (but not the struct
   itself or its cusparse handle), so it can be refilled by a new numeric/symbolic
   factorization. Safe when *trifactors == NULL. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy csrsv path: four triangular-factor structs plus a work vector */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* SpSV path: device arrays, descriptors, and analysis buffers.
       NOTE(review): these pointers are not set to NULL after the frees below; this
       looks safe only if Reset is never called twice without an intervening refill —
       confirm against the callers before relying on it. */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4246 
/* Fully destroy the tri-factors struct: reset its contents, destroy the cusparse
   handle, and free the struct itself. A NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); /* frees members, keeps handle */
    PetscCallCUSPARSE(cusparseDestroy(fs->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4257 
/* Strict-weak ordering on (i,j) pairs: lexicographic, row first then column. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
4266 
/* Equality of (i,j) pairs: both components must match. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>(); }
};
4274 
/* 0 when the two values are equal, 1 otherwise (used with adjacent_difference to flag changes). */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
4278 
/* Logical OR of two flags, returned as 0/1 (combines the row-change and col-change flags). */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
4282 
4283 #include <thrust/iterator/discard_iterator.h>
4284 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert (imode == INSERT_VALUES) or add (imode == ADD_VALUES) the COO values v[] into
   the device CSR values, using the permutation cooPerm (and, when the COO list had
   repeated (i,j) entries, the compaction map cooPerm_a) computed at preallocation time.
   v may be a host or device pointer; v == NULL with INSERT_VALUES zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device staging copy when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done: just finish assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size(); /* length of the original COO list, repeats included */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      /* first reduce repeats into cooPerm_w, then add cooPerm_w into the CSR values */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries with INSERT_VALUES: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* only the device copy was updated */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4361 
/* Mark A's cached transpose as out of date; when destroy is true also free the
   transpose mult struct and the csr2csc index map so they are rebuilt from scratch. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4377 
4378 #include <thrust/binary_search.h>
4379 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4380 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4381 {
4382   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4383   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
4384   PetscInt            cooPerm_n, nzr = 0;
4385 
4386   PetscFunctionBegin;
4387   PetscCall(PetscLayoutSetUp(A->rmap));
4388   PetscCall(PetscLayoutSetUp(A->cmap));
4389   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4390   if (n != cooPerm_n) {
4391     delete cusp->cooPerm;
4392     delete cusp->cooPerm_a;
4393     cusp->cooPerm   = NULL;
4394     cusp->cooPerm_a = NULL;
4395   }
4396   if (n) {
4397     thrust::device_ptr<PetscInt> d_i, d_j;
4398     PetscInt                    *d_raw_i, *d_raw_j;
4399     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4400     PetscMemType                 imtype, jmtype;
4401 
4402     PetscCall(PetscGetMemType(coo_i, &imtype));
4403     if (PetscMemTypeHost(imtype)) {
4404       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4405       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4406       d_i        = thrust::device_pointer_cast(d_raw_i);
4407       free_raw_i = PETSC_TRUE;
4408       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4409     } else {
4410       d_i = thrust::device_pointer_cast(coo_i);
4411     }
4412 
4413     PetscCall(PetscGetMemType(coo_j, &jmtype));
4414     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4415       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4416       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4417       d_j        = thrust::device_pointer_cast(d_raw_j);
4418       free_raw_j = PETSC_TRUE;
4419       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4420     } else {
4421       d_j = thrust::device_pointer_cast(coo_j);
4422     }
4423 
4424     THRUSTINTARRAY ii(A->rmap->n);
4425 
4426     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4427     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
4428 
4429     /* Ex.
4430       n = 6
4431       coo_i = [3,3,1,4,1,4]
4432       coo_j = [3,2,2,5,2,6]
4433     */
4434     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4435     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
4436 
4437     PetscCall(PetscLogGpuTimeBegin());
4438     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4439     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4440     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4441     THRUSTINTARRAY w(d_j, d_j + n);
4442 
4443     /*
4444       d_i     = [1,1,3,3,4,4]
4445       d_j     = [2,2,2,3,5,6]
4446       cooPerm = [2,4,1,0,3,5]
4447     */
4448     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4449 
4450     /*
4451       d_i     = [1,3,3,4,4,x]
4452                             ^ekey
4453       d_j     = [2,2,3,5,6,x]
4454                            ^nekey
4455     */
4456     if (nekey == ekey) { /* all entries are unique */
4457       delete cusp->cooPerm_a;
4458       cusp->cooPerm_a = NULL;
4459     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4460       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4461       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4462       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4463       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4464       w[0]                  = 0;
4465       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4466       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4467     }
4468     thrust::counting_iterator<PetscInt> search_begin(0);
4469     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4470                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4471                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4472     PetscCall(PetscLogGpuTimeEnd());
4473 
4474     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
4475     a->singlemalloc = PETSC_FALSE;
4476     a->free_a       = PETSC_TRUE;
4477     a->free_ij      = PETSC_TRUE;
4478     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4479     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4480     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4481     a->nz = a->maxnz = a->i[A->rmap->n];
4482     a->rmax          = 0;
4483     PetscCall(PetscMalloc1(a->nz, &a->a));
4484     PetscCall(PetscMalloc1(a->nz, &a->j));
4485     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4486     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
4487     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
4488     for (PetscInt i = 0; i < A->rmap->n; i++) {
4489       const PetscInt nnzr = a->i[i + 1] - a->i[i];
4490       nzr += (PetscInt) !!(nnzr);
4491       a->ilen[i] = a->imax[i] = nnzr;
4492       a->rmax                 = PetscMax(a->rmax, nnzr);
4493     }
4494     a->nonzerorowcnt = nzr;
4495     A->preallocated  = PETSC_TRUE;
4496     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
4497     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4498     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4499     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
4500   } else {
4501     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
4502   }
4503   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
4504 
4505   /* We want to allocate the CUSPARSE struct for matvec now.
4506      The code is so convoluted now that I prefer to copy zeros */
4507   PetscCall(PetscArrayzero(a->a, a->nz));
4508   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
4509   A->offloadmask = PETSC_OFFLOAD_CPU;
4510   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4511   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
4512   PetscFunctionReturn(PETSC_SUCCESS);
4513 }
4514 
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool    use_basic = PETSC_TRUE;
  PetscMemType mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* Negative (ignored) indices can only be detected when the index arrays live on the host;
     arrays already on the device are assumed to contain plain nonnegative indices */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n && use_basic; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) use_basic = PETSC_FALSE;
      }
    }
  }

  if (use_basic) { /* i,j are on device or contain no negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    /* Extended path: build the jmap/perm maps with the host AIJ code, then mirror them on the device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4553 
/* Fold freshly provided COO values kv[] into the CSR value array a[].
   jmap[i]..jmap[i+1] delimits, and perm[] indexes, the kv entries that map to nonzero i.
   1D grid-stride loop: any launch configuration covers all nnz entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar partial = 0.0;
    /* accumulate every (possibly repeated) COO entry destined for nonzero i */
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) partial += kv[perm[k]];
    if (imode == INSERT_VALUES) a[i] = partial;
    else a[i] += partial;
  }
}
4564 
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  } else {
    const PetscCount   Annz   = seq->nz;
    const PetscBool    insert = (imode == INSERT_VALUES) ? PETSC_TRUE : PETSC_FALSE;
    const PetscScalar *v_dev  = v;
    PetscScalar       *Aa;
    PetscMemType       memtype;

    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* stage host-resident values on the device before launching the kernel */
      PetscCallCUDA(cudaMalloc((void **)&v_dev, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v_dev, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so write-only access suffices; ADD_VALUES must read old values */
    if (insert) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v_dev, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
    }

    if (insert) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v_dev));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4599 
4600 /*@C
4601     MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4602 
4603    Not Collective
4604 
4605     Input Parameters:
4606 +   A - the matrix
4607 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4608 
4609     Output Parameters:
4610 +   i - the CSR row pointers
4611 -   j - the CSR column indices
4612 
4613     Level: developer
4614 
4615     Note:
4616       When compressed is true, the CSR structure does not contain empty rows
4617 
4618 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4619 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* both outputs must be requested; otherwise this call is a no-op */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (!compressed && aij->compressedrow.use) {
    /* the device CSR skips empty rows; build (and cache) full row offsets from the host a->i */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(aij->i, aij->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else *i = csr->row_offsets->data().get();
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4647 
4648 /*@C
4649     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4650 
4651    Not Collective
4652 
4653     Input Parameters:
4654 +   A - the matrix
4655 .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4656 .   i - the CSR row pointers
4657 -   j - the CSR column indices
4658 
4659     Level: developer
4660 
4661 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4662 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* accepted for symmetry with MatSeqAIJCUSPARSEGetIJ(); nothing format-dependent to undo */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4673 
4674 /*@C
4675    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4676 
4677    Not Collective
4678 
4679    Input Parameter:
4680 .   A - a `MATSEQAIJCUSPARSE` matrix
4681 
4682    Output Parameter:
4683 .   a - pointer to the device data
4684 
4685    Level: developer
4686 
4687    Note:
4688    May trigger host-device copies if up-to-date matrix data is on host
4689 
4690 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4691 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* refresh the device values if the host copy is newer */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offload mask and cached transpose stay valid */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4709 
4710 /*@C
4711    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4712 
4713    Not Collective
4714 
4715    Input Parameters:
4716 +   A - a `MATSEQAIJCUSPARSE` matrix
4717 -   a - pointer to the device data
4718 
4719    Level: developer
4720 
4721 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4722 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read access cannot have modified the matrix, so no state invalidation is needed
     (contrast MatSeqAIJCUSPARSERestoreArray()); just drop the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4732 
4733 /*@C
4734    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4735 
4736    Not Collective
4737 
4738    Input Parameter:
4739 .   A - a `MATSEQAIJCUSPARSE` matrix
4740 
4741    Output Parameter:
4742 .   a - pointer to the device data
4743 
4744    Level: developer
4745 
4746    Note:
4747    May trigger host-device copies if up-to-date matrix data is on host
4748 
4749 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4750 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: device values must be current before handing out the pointer */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may write through the pointer: the device copy becomes authoritative and any cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4770 /*@C
4771    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4772 
4773    Not Collective
4774 
4775    Input Parameters:
4776 +   A - a `MATSEQAIJCUSPARSE` matrix
4777 -   a - pointer to the device data
4778 
4779    Level: developer
4780 
4781 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4782 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));            /* values may have changed: cached diagonal/factorization info is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));  /* bump the object state so dependents notice the modification */
  *a = NULL;                                            /* invalidate the borrowed pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4794 
4795 /*@C
4796    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4797 
4798    Not Collective
4799 
4800    Input Parameter:
4801 .   A - a `MATSEQAIJCUSPARSE` matrix
4802 
4803    Output Parameter:
4804 .   a - pointer to the device data
4805 
4806    Level: developer
4807 
4808    Note:
4809    Does not trigger host-device copies and flags data validity on the GPU
4810 
4811 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4812 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no MatSeqAIJCUSPARSECopyToGPU(), old values are irrelevant;
     the device structure must already exist, though */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* device copy becomes authoritative; any cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4831 
4832 /*@C
4833    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4834 
4835    Not Collective
4836 
4837    Input Parameters:
4838 +   A - a `MATSEQAIJCUSPARSE` matrix
4839 -   a - pointer to the device data
4840 
4841    Level: developer
4842 
4843 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4844 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));            /* values were (re)written: cached diagonal info is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));  /* bump the object state so dependents notice the modification */
  *a = NULL;                                            /* invalidate the borrowed pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4856 
/* Strict-weak ordering on (row, col, value, flag) tuples: lexicographic in (row, col) only */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2; /* rows differ: order by row */
    return t1.get<1>() < t2.get<1>(); /* same row: order by column */
  }
};
4865 
/* Unary functor that adds a fixed offset to an index (used to shift B's column indices past A's) */
struct Shift {
  int _shift; /* offset added to every input */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4872 
4873 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4874 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4875 {
4876   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4877   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4878   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4879   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4880   PetscInt                      Annz, Bnnz;
4881   cusparseStatus_t              stat;
4882   PetscInt                      i, m, n, zero = 0;
4883 
4884   PetscFunctionBegin;
4885   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4886   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4887   PetscValidPointer(C, 4);
4888   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4889   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4890   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4891   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4892   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4893   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4894   if (reuse == MAT_INITIAL_MATRIX) {
4895     m = A->rmap->n;
4896     n = A->cmap->n + B->cmap->n;
4897     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4898     PetscCall(MatSetSizes(*C, m, n, m, n));
4899     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4900     c                       = (Mat_SeqAIJ *)(*C)->data;
4901     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4902     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4903     Ccsr                    = new CsrMatrix;
4904     Cmat->cprowIndices      = NULL;
4905     c->compressedrow.use    = PETSC_FALSE;
4906     c->compressedrow.nrows  = 0;
4907     c->compressedrow.i      = NULL;
4908     c->compressedrow.rindex = NULL;
4909     Ccusp->workVector       = NULL;
4910     Ccusp->nrows            = m;
4911     Ccusp->mat              = Cmat;
4912     Ccusp->mat->mat         = Ccsr;
4913     Ccsr->num_rows          = m;
4914     Ccsr->num_cols          = n;
4915     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4916     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4917     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4918     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4919     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4920     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4921     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4922     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4923     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4924     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4925     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4926     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4927     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4928 
4929     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4930     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4931     Annz                 = (PetscInt)Acsr->column_indices->size();
4932     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4933     c->nz                = Annz + Bnnz;
4934     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4935     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4936     Ccsr->values         = new THRUSTARRAY(c->nz);
4937     Ccsr->num_entries    = c->nz;
4938     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4939     if (c->nz) {
4940       auto              Acoo = new THRUSTINTARRAY32(Annz);
4941       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4942       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4943       THRUSTINTARRAY32 *Aroff, *Broff;
4944 
4945       if (a->compressedrow.use) { /* need full row offset */
4946         if (!Acusp->rowoffsets_gpu) {
4947           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4948           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4949           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4950         }
4951         Aroff = Acusp->rowoffsets_gpu;
4952       } else Aroff = Acsr->row_offsets;
4953       if (b->compressedrow.use) { /* need full row offset */
4954         if (!Bcusp->rowoffsets_gpu) {
4955           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4956           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4957           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4958         }
4959         Broff = Bcusp->rowoffsets_gpu;
4960       } else Broff = Bcsr->row_offsets;
4961       PetscCall(PetscLogGpuTimeBegin());
4962       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4963       PetscCallCUSPARSE(stat);
4964       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4965       PetscCallCUSPARSE(stat);
4966       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4967       auto Aperm = thrust::make_constant_iterator(1);
4968       auto Bperm = thrust::make_constant_iterator(0);
4969 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4970       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4971       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4972 #else
4973       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4974       auto Bcib = Bcsr->column_indices->begin();
4975       auto Bcie = Bcsr->column_indices->end();
4976       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4977 #endif
4978       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4979       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4980       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4981       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4982       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4983       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4984       auto p1    = Ccusp->cooPerm->begin();
4985       auto p2    = Ccusp->cooPerm->begin();
4986       thrust::advance(p2, Annz);
4987       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4988 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4989       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4990 #endif
4991       auto cci = thrust::make_counting_iterator(zero);
4992       auto cce = thrust::make_counting_iterator(c->nz);
4993 #if 0 //Errors on SUMMIT cuda 11.1.0
4994       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4995 #else
4996       auto pred = thrust::identity<int>();
4997       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4998       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4999 #endif
5000       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
5001       PetscCallCUSPARSE(stat);
5002       PetscCall(PetscLogGpuTimeEnd());
5003       delete wPerm;
5004       delete Acoo;
5005       delete Bcoo;
5006       delete Ccoo;
5007 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5008       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5009       PetscCallCUSPARSE(stat);
5010 #endif
5011       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
5012         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
5013         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5014         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5015         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5016         CsrMatrix                    *CcsrT = new CsrMatrix;
5017         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5018         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5019 
5020         (*C)->form_explicit_transpose = PETSC_TRUE;
5021         (*C)->transupdated            = PETSC_TRUE;
5022         Ccusp->rowoffsets_gpu         = NULL;
5023         CmatT->cprowIndices           = NULL;
5024         CmatT->mat                    = CcsrT;
5025         CcsrT->num_rows               = n;
5026         CcsrT->num_cols               = m;
5027         CcsrT->num_entries            = c->nz;
5028 
5029         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
5030         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5031         CcsrT->values         = new THRUSTARRAY(c->nz);
5032 
5033         PetscCall(PetscLogGpuTimeBegin());
5034         auto rT = CcsrT->row_offsets->begin();
5035         if (AT) {
5036           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
5037           thrust::advance(rT, -1);
5038         }
5039         if (BT) {
5040           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
5041           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
5042           thrust::copy(titb, tite, rT);
5043         }
5044         auto cT = CcsrT->column_indices->begin();
5045         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
5046         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
5047         auto vT = CcsrT->values->begin();
5048         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5049         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5050         PetscCall(PetscLogGpuTimeEnd());
5051 
5052         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
5053         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
5054         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5055         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
5056         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
5057         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
5058         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5059         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5060         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5061 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5062         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5063         PetscCallCUSPARSE(stat);
5064 #endif
5065         Ccusp->matTranspose = CmatT;
5066       }
5067     }
5068 
5069     c->singlemalloc = PETSC_FALSE;
5070     c->free_a       = PETSC_TRUE;
5071     c->free_ij      = PETSC_TRUE;
5072     PetscCall(PetscMalloc1(m + 1, &c->i));
5073     PetscCall(PetscMalloc1(c->nz, &c->j));
5074     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
5075       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5076       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5077       ii = *Ccsr->row_offsets;
5078       jj = *Ccsr->column_indices;
5079       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5080       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5081     } else {
5082       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5083       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5084     }
5085     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
5086     PetscCall(PetscMalloc1(m, &c->ilen));
5087     PetscCall(PetscMalloc1(m, &c->imax));
5088     c->maxnz         = c->nz;
5089     c->nonzerorowcnt = 0;
5090     c->rmax          = 0;
5091     for (i = 0; i < m; i++) {
5092       const PetscInt nn = c->i[i + 1] - c->i[i];
5093       c->ilen[i] = c->imax[i] = nn;
5094       c->nonzerorowcnt += (PetscInt) !!nn;
5095       c->rmax = PetscMax(c->rmax, nn);
5096     }
5097     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
5098     PetscCall(PetscMalloc1(c->nz, &c->a));
5099     (*C)->nonzerostate++;
5100     PetscCall(PetscLayoutSetUp((*C)->rmap));
5101     PetscCall(PetscLayoutSetUp((*C)->cmap));
5102     Ccusp->nonzerostate = (*C)->nonzerostate;
5103     (*C)->preallocated  = PETSC_TRUE;
5104   } else {
5105     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
5106     c = (Mat_SeqAIJ *)(*C)->data;
5107     if (c->nz) {
5108       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
5109       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
5110       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
5111       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
5112       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5113       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5114       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5115       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5116       Acsr = (CsrMatrix *)Acusp->mat->mat;
5117       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
5118       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
5119       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
5120       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
5121       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
5122       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
5123       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
5124       auto pmid = Ccusp->cooPerm->begin();
5125       thrust::advance(pmid, Acsr->num_entries);
5126       PetscCall(PetscLogGpuTimeBegin());
5127       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
5128       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5129       thrust::for_each(zibait, zieait, VecCUDAEquals());
5130       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5131       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
5132       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
5133       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
5134       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5135         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5136         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5137         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5138         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5139         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5140         auto       vT    = CcsrT->values->begin();
5141         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5142         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5143         (*C)->transupdated = PETSC_TRUE;
5144       }
5145       PetscCall(PetscLogGpuTimeEnd());
5146     }
5147   }
5148   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5149   (*C)->assembled     = PETSC_TRUE;
5150   (*C)->was_assembled = PETSC_FALSE;
5151   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5152   PetscFunctionReturn(PETSC_SUCCESS);
5153 }
5154 
/*
  Copies entries of the (GPU-resident) value array of A into the user buffer v.

  If idx is provided, v[i] = a[idx[i]] for i = 0..n-1 (a gather over the aij value
  array); otherwise the first n values are copied contiguously. The destination v
  may live in either host or device memory; device destinations are written
  directly, host destinations go through a temporary device staging buffer (gather
  case) or a direct device-to-host memcpy (contiguous case).
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* detect whether v is device memory so we can skip host staging */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set, then gather on the GPU */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer; copied back to host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* zip (av[idx[i]], dv[i]) pairs; VecCUDAEquals assigns the gathered value into dv */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: plain contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved GPU->CPU, so log in that direction (was incorrectly PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5190