xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2fe279fdf3e687a416e4eadb7d3c7a82d60442c6)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
25 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29 
30   typedef enum {
31       CUSPARSE_MV_ALG_DEFAULT = 0,
32       CUSPARSE_COOMV_ALG      = 1,
33       CUSPARSE_CSRMV_ALG1     = 2,
34       CUSPARSE_CSRMV_ALG2     = 3
35   } cusparseSpMVAlg_t;
36 
37   typedef enum {
38       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44       CUSPARSE_SPMM_COO_ALG1    = 1,
45       CUSPARSE_SPMM_COO_ALG2    = 2,
46       CUSPARSE_SPMM_COO_ALG3    = 3,
47       CUSPARSE_SPMM_COO_ALG4    = 5,
48       CUSPARSE_SPMM_CSR_ALG1    = 4,
49       CUSPARSE_SPMM_CSR_ALG2    = 6,
50   } cusparseSpMMAlg_t;
51 
52   typedef enum {
53       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55   } cusparseCsr2CscAlg_t;
56   */
57 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60 #endif
61 
62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
66 #if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0)
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
68 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
72 #endif
73 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
74 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
75 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
76 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
77 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
83 
84 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
85 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
86 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
88 
89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
91 
92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
93 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
94 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* A sequential AIJCUSPARSE matrix keeps a single storage format, so both
     MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same field */
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
113 
114 /*@
115    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
116    operation. Only the `MatMult()` operation can use different GPU storage formats
117 
118    Not Collective
119 
120    Input Parameters:
121 +  A - Matrix of type `MATSEQAIJCUSPARSE`
122 .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
123         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
124 -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
125 
126    Level: intermediate
127 
.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Forward to the type-specific implementation composed as "MatCUSPARSESetFormat_C";
     PetscTryMethod() makes this a no-op for matrix types that do not provide it */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
137 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  /* Record the CPU-solve preference on the cuSPARSE-specific data attached to the matrix */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
146 
147 /*@
148    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
149 
150    Input Parameters:
151 +  A - Matrix of type `MATSEQAIJCUSPARSE`
152 -  use_cpu - set flag for using the built-in CPU `MatSolve()`
153 
154    Level: intermediate
155 
156    Note:
157    The cuSparse LU solver currently computes the factors with the built-in CPU method
158    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or the GPU (GPU is the default).
160 
161 .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
162 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Forward to the type-specific implementation composed as "MatCUSPARSESetUseCPUSolve_C";
     PetscTryMethod() makes this a no-op for matrix types that do not provide it */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
170 
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    /* every other option is handled by the host SeqAIJ implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
186 
// Process runtime options for (seq)aijcusparse matrices: the GPU storage format used
// for SpMV/TriSolve, whether (I)LU solves run on the CPU, and (CUDA >= 11) the cuSPARSE
// algorithms used for SpMV, SpMM and CSR->CSC conversion.
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { // these options are only processed for non-factored matrices
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* Same consistency check as above, against the position of CSR_ALG1 in MatCUSPARSESpMMAlgorithms[] */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
222 
223 #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
// When A's factored data are up to date on the CPU, pack L and U (stored by SeqAIJ in a
// "skewed" layout: L row-wise without its unit diagonal, U via a->diag) into one regular
// host CSR matrix M, upload it to the device, and run the numeric SpSV analysis.
// On the first call this also allocates the device arrays and creates the cuSPARSE
// descriptors: both the L and U SpMat descriptors alias the same CSR arrays, with
// fill-mode/diag-type attributes telling cuSPARSE which triangle (and which diagonal
// convention) each solve should use.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];        // number of L entries in row i (diagonal excluded)
        PetscInt ulen = Adiag[i] - Adiag[i + 1];  // number of U entries in row i (diagonal included)
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // U aliases the same device CSR arrays but reads the upper triangle with a non-unit diagonal
      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep the host row pointers and values buffer so later calls only refresh values
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
320 #else
// Legacy (pre-CUDA-11.3) path: build or refresh the lower triangular factor L of an ILU
// factorization on the GPU from the host SeqAIJ factored data, with explicit 1's on the
// diagonal, and run the csrsv solve analysis used later by the triangular solves.
// First call allocates host staging buffers and the device CsrMatrix; subsequent calls
// (loTriFactor already set) only refresh the numerical values.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host staging buffers (cudaMallocHost) for the host-to-device copies below */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal; each later
           row copies its strictly-lower entries from the factored AIJ data and appends a 1.0 */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        /* DIAG_TYPE_UNIT: cuSPARSE will not read the stored 1.0 diagonal entries */
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep AALo (as AA_h) so value-only updates can reuse it */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
451 
452 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
453 {
454   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
455   PetscInt                           n                  = A->rmap->n;
456   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
457   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
458   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
459   const MatScalar                   *aa = a->a, *v;
460   PetscInt                          *AiUp, *AjUp;
461   PetscInt                           i, nz, nzUpper, offset;
462 
463   PetscFunctionBegin;
464   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
465   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
466     try {
467       /* next, figure out the number of nonzeros in the upper triangular matrix. */
468       nzUpper = adiag[0] - adiag[n];
469       if (!upTriFactor) {
470         PetscScalar *AAUp;
471 
472         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
473 
474         /* Allocate Space for the upper triangular matrix */
475         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
476         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
477 
478         /* Fill the upper triangular matrix */
479         AiUp[0] = (PetscInt)0;
480         AiUp[n] = nzUpper;
481         offset  = nzUpper;
482         for (i = n - 1; i >= 0; i--) {
483           v  = aa + adiag[i + 1] + 1;
484           vi = aj + adiag[i + 1] + 1;
485 
486           /* number of elements NOT on the diagonal */
487           nz = adiag[i] - adiag[i + 1] - 1;
488 
489           /* decrement the offset */
490           offset -= (nz + 1);
491 
492           /* first, set the diagonal elements */
493           AjUp[offset] = (PetscInt)i;
494           AAUp[offset] = (MatScalar)1. / v[nz];
495           AiUp[i]      = AiUp[i + 1] - (nz + 1);
496 
497           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
498           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
499         }
500 
501         /* allocate space for the triangular factor information */
502         PetscCall(PetscNew(&upTriFactor));
503         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
504 
505         /* Create the matrix description */
506         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
507         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
508   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
509         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
510   #else
511         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
512   #endif
513         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
514         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
515 
516         /* set the operation */
517         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
518 
519         /* set the matrix */
520         upTriFactor->csrMat              = new CsrMatrix;
521         upTriFactor->csrMat->num_rows    = n;
522         upTriFactor->csrMat->num_cols    = n;
523         upTriFactor->csrMat->num_entries = nzUpper;
524 
525         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
526         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
527 
528         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
529         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
530 
531         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
532         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
533 
534         /* Create the solve analysis information */
535         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
537   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
538         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
539                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
540         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
541   #endif
542 
543         /* perform the solve analysis */
544         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
545                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
546 
547         PetscCallCUDA(WaitForCUDA());
548         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
549 
550         /* assign the pointer */
551         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
552         upTriFactor->AA_h                                          = AAUp;
553         PetscCallCUDA(cudaFreeHost(AiUp));
554         PetscCallCUDA(cudaFreeHost(AjUp));
555         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
556       } else {
557         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
558         /* Fill the upper triangular matrix */
559         offset = nzUpper;
560         for (i = n - 1; i >= 0; i--) {
561           v = aa + adiag[i + 1] + 1;
562 
563           /* number of elements NOT on the diagonal */
564           nz = adiag[i] - adiag[i + 1] - 1;
565 
566           /* decrement the offset */
567           offset -= (nz + 1);
568 
569           /* first, set the diagonal elements */
570           upTriFactor->AA_h[offset] = 1. / v[nz];
571           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
572         }
573         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
574         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
575       }
576     } catch (char *ex) {
577       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
578     }
579   }
580   PetscFunctionReturn(PETSC_SUCCESS);
581 }
582 #endif
583 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row, colperm = aij->icol;
  PetscBool                     rowid, colid;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif

  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* cache the row permutation on the device (once) unless it is the identity */
  PetscCall(ISIdentity(rowperm, &rowid));
  if (!rowid && !fs->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowperm, &ridx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(ridx, ridx + m);
    PetscCall(ISRestoreIndices(rowperm, &ridx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* likewise for the column permutation */
  PetscCall(ISIdentity(colperm, &colid));
  if (!colid && !fs->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(colperm, &cidx));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(cidx, cidx + m);
    PetscCall(ISRestoreIndices(colperm, &cidx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
630 
631 #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
// Build/refresh on the GPU the single factored matrix used by the Ut D U Cholesky solve,
// from the host ICC/Cholesky factor. Only the upper factor U (declared unit-diagonal to
// cusparse) plus the separately stored diagonal D are kept; the Ut solve reuses the same
// matrix descriptor with a transposed SpSV. The first call allocates device storage and
// creates descriptors; subsequent calls only re-upload values and redo SpSV analysis.
// Note: the function-name spelling "Cheolesky" is historical; its caller uses the same name.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; values (csrVal) are uploaded below, outside this first-time branch
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value: executed on every numeric (re)factorization, not just the first
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // off-diagonal entries of U are stored negated on host; flip the sign here
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
716 
// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *braw;
  PetscScalar                          *xraw;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  const PetscInt                        m = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xraw));
  PetscCall(VecCUDAGetArrayRead(b, &braw));
  xdev = thrust::device_pointer_cast(xraw);
  bdev = thrust::device_pointer_cast(braw);

  // Apply the row permutation to b if present; the (possibly permuted) rhs becomes dnVecDescr_X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)braw));
  }

  // First triangular solve: Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  // Diagonal solve diag(D) Z = Y, done as the element-wise product Y *= D since D already
  // holds inverted diagonal entries (see MatCholeskyFactorNumeric_SeqAIJ()). This is just a
  // vector element-wise multiplication, but cublas does not provide one, hence thrust.
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Second triangular solve: U X = Y. Target the scratch buffer X only when a column
  // permutation must still be applied; otherwise solve straight into x.
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xraw));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  // Scatter the intermediate result through the column permutation into x, when needed
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xdev));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &braw));
  PetscCall(VecCUDARestoreArrayWrite(x, &xraw));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772 #else
// Pre CUDA-11.3 path: build (first call) or refresh (later calls) the two cusparse
// triangular-solve structures for an ICC factor. Both factors share the same
// upper-triangular CSR pattern: the "upper" factor U is solved with NON_TRANSPOSE and a
// unit diagonal, while the "lower" factor is the transpose of the same storage (solveOp
// TRANSPOSE, non-unit diagonal) with values scaled so that L = Ut D.
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  // NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ; the ICC factor
  // apparently uses the SBAIJ (upper-triangular) storage layout for i/j/a — confirm upstream.
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix; each row is [diag | off-diag entries] */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] holds the row's diagonal value */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonals are negated for U; the lower factor is additionally scaled by 1/diag */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* the lower factor reuses the upper-triangular storage; the transpose op below realizes L */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structures already exist: only recompute the values and re-upload them. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969 #endif
970 
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            perm = aij->row;
  PetscBool                     identity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif
  fs->nnz = (aij->nz - m) * 2 + m; // off-diagonal entries counted for both factors, diagonal once

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is now synced to the GPU

  /* Cache the permutation and its inverse on the device unless it is the identity */
  PetscCall(ISIdentity(perm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *iidx, *idx;

    PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &iidx));
    PetscCall(ISGetIndices(perm, &idx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(idx, idx + m);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(iidx, iidx + m);
    PetscCall(ISRestoreIndices(invperm, &iidx));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(perm, &idx));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* numeric factorization runs on the host, so bring A's values down first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    PetscBool   identity;

    PetscCall(ISIdentity(bseq->row, &identity));
    if (identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1045 
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0)
1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054   cusparseIndexBase_t                indexBase;
1055   cusparseMatrixType_t               matrixType;
1056   cusparseFillMode_t                 fillMode;
1057   cusparseDiagType_t                 diagType;
1058 
1059   PetscFunctionBegin;
1060   /* allocate space for the transpose of the lower triangular factor */
1061   PetscCall(PetscNew(&loTriFactorT));
1062   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063 
1064   /* set the matrix descriptors of the lower triangular factor */
1065   matrixType = cusparseGetMatType(loTriFactor->descr);
1066   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069 
1070   /* Create the matrix description */
1071   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076 
1077   /* set the operation */
1078   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079 
1080   /* allocate GPU space for the CSC of the lower triangular factor*/
1081   loTriFactorT->csrMat                 = new CsrMatrix;
1082   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088 
1089   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095   #endif
1096 
1097   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098   {
1099     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104   #else
1105                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106   #endif
1107     PetscCallCUSPARSE(stat);
1108   }
1109 
1110   PetscCallCUDA(WaitForCUDA());
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112 
1113   /* Create the solve analysis information */
1114   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120   #endif
1121 
1122   /* perform the solve analysis */
1123   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125 
1126   PetscCallCUDA(WaitForCUDA());
1127   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128 
1129   /* assign the pointer */
1130   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131 
1132   /*********************************************/
1133   /* Now the Transpose of the Upper Tri Factor */
1134   /*********************************************/
1135 
1136   /* allocate space for the transpose of the upper triangular factor */
1137   PetscCall(PetscNew(&upTriFactorT));
1138   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139 
1140   /* set the matrix descriptors of the upper triangular factor */
1141   matrixType = cusparseGetMatType(upTriFactor->descr);
1142   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145 
1146   /* Create the matrix description */
1147   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152 
1153   /* set the operation */
1154   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155 
1156   /* allocate GPU space for the CSC of the upper triangular factor*/
1157   upTriFactorT->csrMat                 = new CsrMatrix;
1158   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164 
1165   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171   #endif
1172 
1173   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   {
1175     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180   #else
1181                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182   #endif
1183     PetscCallCUSPARSE(stat);
1184   }
1185 
1186   PetscCallCUDA(WaitForCUDA());
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188 
1189   /* Create the solve analysis information */
1190   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196   #endif
1197 
1198   /* perform the solve analysis */
1199   /* christ, would it have killed you to put this stuff in a function????????? */
1200   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202 
1203   PetscCallCUDA(WaitForCUDA());
1204   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205 
1206   /* assign the pointer */
1207   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform to turn csr2csc-produced scalar "values" (which
   actually hold integer source positions) back into an integer permutation. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar value) { return static_cast<PetscInt>(PetscRealPart(value)); }
};
1215 
/* Build (or refresh) an explicit device-side transpose of A, stored in cusparsestruct->matTranspose.

   On first call (no matTranspose yet) a new Mat_SeqAIJCUSPARSEMultStruct is allocated with
   transposed dimensions. For CSR storage the numerical transpose is produced with cusparse
   csr2csc; for ELL/HYB (CUDA < 11 only) a HYB -> CSR -> CSC -> HYB round trip is performed.
   For CSR, a gather permutation (csr2csc_i) is also cached so that subsequent calls can update
   the transposed values with a single thrust gather instead of re-running csr2csc.
   No-op when A->transupdated is already set. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* The fast value-update path below only handles CSR, so for ELL/HYB throw away any stale transpose first */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Allocate the transposed CSR arrays: row/column counts are swapped, nnz is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* Upload the host row offsets a->i; csr2csc below reads them from the device */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* No direct HYB transpose in cusparse: go HYB -> CSR -> CSC (= transpose) -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the csr2csc gather permutation: run csr2csc with "values" 0,1,2,...,nnz-1 so the
         transposed values array ends up holding, for each transposed entry, the position of its
         source entry in matrix->values; converting those scalars to integers yields csr2csc_i */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* csr2cscEx2 requires a caller-provided work buffer; query its size first */
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* Refresh the transposed values with a single gather through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408 
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
/* Solve A x = b using the cusparse SpSV LU factorization held in A->spptr (CUDA >= 11.3 path).

   Pipeline: optionally apply the row permutation to b (result staged in fs->X), solve L Y = X,
   solve U X = Y, then optionally apply the column permutation to write the result into x.
   When no permutation is present, b/x device arrays are bound to the dense-vector descriptors
   directly, avoiding the staging buffer. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // No row permutation: bind b's device array to the X descriptor directly (cast away const for the C API)
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    // A column permutation follows, so solve into the staging buffer fs->X instead of x
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  // Two triangular solves: ~2*nz flops minus the m diagonal entries counted twice
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461 
/* Solve A^T x = b using the cusparse SpSV LU factorization (CUDA >= 11.3 path).

   Since A = L U implies A^T = U^T L^T, the solve order is U^T Y = X then L^T X = Y, reusing the
   same L/U matrix descriptors with op = CUSPARSE_OPERATION_TRANSPOSE. Dedicated transpose SpSV
   descriptors (spsvDescr_Lt/Ut) and their analysis are created lazily on the first call and
   cached on the factor structure. Row/column permutations are applied exactly as in
   MatSolve_SeqAIJCUSPARSE_LU. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    // Create transpose-solve descriptors and their work buffers once; they persist on fs
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // (Re)run the analysis phase when the factors changed since the last transpose solve
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1532 #else
/* TODO: investigate whether this separate analysis of the transposed factors can be avoided by
   solving with op(A) = CUSPARSE_OPERATION_TRANSPOSE directly in MatSolve_SeqAIJCUSPARSE. */
/* Solve A^T x = b with the legacy csrsv API (CUDA < 11.3 path), general (permuted) ordering.

   Uses explicitly transposed triangular factors (loTriFactorPtrTranspose/upTriFactorPtrTranspose),
   built lazily on first call by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve. The transposed order is
   U^T first, then L^T, with row/column permutations applied before/after the two solves. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
/* Solve A^T x = b when the factorization was done in natural ordering (no permutations).
   The transposed triangular factors are created lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Build the transposed factors on the fly if they do not exist yet */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Map the PETSc vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Solve with the transposed upper factor first: result goes to the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b_d, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Then with the transposed lower factor: result lands in x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x_d, loT->solvePolicy, loT->solveBuffer));

  /* Hand the vectors back to PETSc */
  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1623 
/* Solve A x = b with the cached lower/upper triangular factors, applying the
   row permutation to b on the way in and the column permutation on the way out. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *b_d;
  PetscScalar                          *x_d;
  thrust::device_ptr<const PetscScalar> bThrust;
  thrust::device_ptr<PetscScalar>       xThrust;
  Mat_SeqAIJCUSPARSETriFactors         *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                          *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map the PETSc vectors onto the device and wrap them for thrust */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));
  xThrust = thrust::device_pointer_cast(x_d);
  bThrust = thrust::device_pointer_cast(b_d);

  PetscCall(PetscLogGpuTimeBegin());
  /* Gather b through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bThrust, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bThrust, factors->rpermIndices->end()), work->begin());

  /* Forward solve with L: result lands in x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, work->data().get(), x_d, lo->solvePolicy, lo->solveBuffer));

  /* Backward solve with U: result lands back in the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, x_d, work->data().get(), up->solvePolicy, up->solveBuffer));

  /* Gather the work vector through the column permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()), thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()), xThrust);

  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1663 
/* Solve A x = b with the cached triangular factors when the factorization used
   natural ordering, so no row/column permutations are required. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map the PETSc vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Forward solve: L y = b, with y in the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b_d, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Backward solve: U x = y */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x_d, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1693 #endif
1694 
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
/* Numeric ILU(0) factorization via cusparseXcsrilu02() (CUDA >= 11.3 path).

   The symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0, which installs this
   routine as fact->ops->lufactornumeric) has already mirrored A's sparsity pattern into
   fs->csrRowPtr32/csrColIdx32 on the device and run the csrilu02 analysis, so here we
   only refresh the values and redo the numeric work. The factorization is done in
   place in fs->csrVal, which backs both the L and U SpSV descriptors. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure A's values are current on the device before the device-to-device copy */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot query is a blocking (synchronizing) call, hence debug-only */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1747 
/* Symbolic ILU(0) factorization (CUDA >= 11.3 path).

   ILU(0) keeps A's sparsity pattern, so fact mirrors A's i,j structure on the device.
   This routine allocates the device CSR arrays, creates the cusparse descriptors for
   M (the in-place factored matrix), L and U, queries/allocates the csrilu02 and SpSV
   buffers, runs the (pattern-only) csrilu02 analysis, estimates the numeric flop count,
   and installs MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 as the numeric phase.
   The two IS (reordering) arguments are unused here. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L of an LU factorization has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Note: do NOT overwrite nzLeft here; the (nzRow - 1) / 2 heuristic only applies to the ICC case
          where just the lower triangle is stored.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1904 
/* Triangular solves with the IC(0) factor: L (L^T) x = b, using the SpSV descriptors
   prepared by the ICC0 symbolic/numeric phases. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *b_d;
  PetscScalar                  *x_d;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b, keeping y in the internal work vector tf->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_Y, tf->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, /* L Y = X */
                                       tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_L));

  /* Backward solve L^T x = y, reusing the L descriptor with a transposed operation */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, /* Lt X = Y */
                                       tf->dnVecDescr_Y, tf->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1935 
/* Numeric IC(0) (incomplete Cholesky) factorization via cusparseXcsric02() (CUDA >= 11.3 path).

   Assumes the device CSR arrays (fs->csrRowPtr32/csrColIdx32/csrVal) and the csric02/SpSV
   descriptors and buffers were set up by the corresponding symbolic phase; here we only
   refresh A's values and redo the numeric work in place in fs->csrVal. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure A's values are current on the device before the device-to-device copy */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot query is a blocking (synchronizing) call, hence debug-only */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (needs valid values), so it must follow cusparseXcsric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0; /* Cholesky factor is symmetric: same routine serves both solves */
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1988 
1989 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1990 {
1991   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1992   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1993   PetscInt                      m, nz;
1994 
1995   PetscFunctionBegin;
1996   if (PetscDefined(USE_DEBUG)) {
1997     PetscInt  i;
1998     PetscBool flg, missing;
1999 
2000     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2001     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2002     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2003     PetscCall(MatMissingDiagonal(A, &missing, &i));
2004     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2005   }
2006 
2007   /* Free the old stale stuff */
2008   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2009 
2010   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2011      but they will not be used. Allocate them just for easy debugging.
2012    */
2013   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2014 
2015   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2016   fact->factortype             = MAT_FACTOR_ICC;
2017   fact->info.factor_mallocs    = 0;
2018   fact->info.fill_ratio_given  = info->fill;
2019   fact->info.fill_ratio_needed = 1.0;
2020 
2021   aij->row = NULL;
2022   aij->col = NULL;
2023 
2024   /* ====================================================================== */
2025   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2026   /* We'll do in-place factorization on fact                                */
2027   /* ====================================================================== */
2028   const int *Ai, *Aj;
2029 
2030   m  = fact->rmap->n;
2031   nz = aij->nz;
2032 
2033   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
2034   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
2035   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2036   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2037   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2038   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2039 
2040   /* ====================================================================== */
2041   /* Create mat descriptors for M, L                                        */
2042   /* ====================================================================== */
2043   cusparseFillMode_t fillMode;
2044   cusparseDiagType_t diagType;
2045 
2046   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2047   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2048   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2049 
2050   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2051     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2052     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2053     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2054     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2055   */
2056   fillMode = CUSPARSE_FILL_MODE_LOWER;
2057   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2058   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2059   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2060   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2061 
2062   /* ========================================================================= */
2063   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2064   /* ========================================================================= */
2065   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2066   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2067 
2068   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2069   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2070 
2071   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2072   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2073 
2074   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2075   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2076 
2077   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2078   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2079 
2080   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2081      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2082    */
2083   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2084     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2085     fs->spsvBuffer_L = fs->factBuffer_M;
2086     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2087   } else {
2088     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2089     fs->spsvBuffer_Lt = fs->factBuffer_M;
2090     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2091   }
2092 
2093   /* ========================================================================== */
2094   /* Perform analysis of ic0 on M                                               */
2095   /* The lower triangular part of M has the same sparsity pattern as L          */
2096   /* ========================================================================== */
2097   int              structural_zero;
2098   cusparseStatus_t status;
2099 
2100   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2101   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2102   if (PetscDefined(USE_DEBUG)) {
2103     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2104     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2105     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2106   }
2107 
2108   /* Estimate FLOPs of the numeric factorization */
2109   {
2110     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2111     PetscInt      *Ai, nzRow, nzLeft;
2112     PetscLogDouble flops = 0.0;
2113 
2114     Ai = Aseq->i;
2115     for (PetscInt i = 0; i < m; i++) {
2116       nzRow = Ai[i + 1] - Ai[i];
2117       if (nzRow > 1) {
2118         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2119           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2120         */
2121         nzLeft = (nzRow - 1) / 2;
2122         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2123       }
2124     }
2125     fs->numericFactFlops = flops;
2126   }
2127   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 #endif
2131 
/* Numeric LU factorization for MATSEQAIJCUSPARSE.

   The factorization itself is performed on the host by MatLUFactorNumeric_SeqAIJ()
   (A's values are first synced back from the device); afterwards the solve
   function pointers on B are switched to the GPU implementations unless the
   user requested CPU solves. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  /* NOTE(review): B is a factor matrix, whose spptr is created as a
     Mat_SeqAIJCUSPARSETriFactors in MatGetFactor_seqaijcusparse_cusparse();
     the cast below reads use_cpu_solve through a Mat_SeqAIJCUSPARSE view --
     confirm the two layouts agree for this field */
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(B->spptr);

  PetscFunctionBegin;
  /* ensure host values of A are current, then factor on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    /* CUDA >= 11.3: single SpSV-based path handles any ordering */
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      /* natural ordering: skip the permutation steps in the solve */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2169 
/* Symbolic LU factorization for MATSEQAIJCUSPARSE.

   Delegates the symbolic phase to the host AIJ implementation after discarding
   any triangular-factor data left from a previous factorization, then installs
   the CUSPARSE numeric-factorization callback on B. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* stale factors from an earlier call must be dropped before re-factoring */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2180 
/* Symbolic ILU factorization for MATSEQAIJCUSPARSE.

   For CUDA >= 11.3, ILU(0) with natural (identity) row/column orderings is
   routed to a fully on-device path; every other case falls back to the host
   AIJ symbolic factorization followed by the CUSPARSE numeric callback. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  /* only probe the orderings when device factorization was requested; otherwise
     the flags stay PETSC_FALSE and we take the host path below */
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  /* device ILU(0) requires level 0 and natural ordering in both directions */
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* host fallback: reset any previous factor data, do CPU symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2203 
/* Symbolic ICC factorization for MATSEQAIJCUSPARSE.

   Mirrors MatILUFactorSymbolic_SeqAIJCUSPARSE(): with CUDA >= 11.3, ICC(0)
   under the natural (identity) permutation runs entirely on the device;
   otherwise the host AIJ symbolic phase is used and the CUSPARSE Cholesky
   numeric callback is installed. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscBool perm_identity = PETSC_FALSE;
  /* the device ICC(0) path is only eligible when factorization was bound to the device */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    /* device path installs its own choleskyfactornumeric callback */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2223 
/* Symbolic Cholesky factorization for MATSEQAIJCUSPARSE.

   Clears any triangular-factor data held over from a previous factorization,
   runs the host AIJ symbolic Cholesky, and installs the CUSPARSE numeric
   Cholesky callback on B. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2234 
/* Report the solver package name (MATSOLVERCUSPARSE) for factor matrices
   created by this package; composed onto factors as "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2241 
2242 /*MC
2243   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU, of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2246   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2247   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2248   algorithms are not recommended. This class does NOT support direct solver operations.
2249 
2250   Level: beginner
2251 
2252 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2253           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2254 M*/
2255 
/* MatGetFactor() implementation for the "cusparse" solver package.

   Creates an empty MATSEQAIJCUSPARSE factor matrix B of the same (square)
   size as A, processes -mat_factor_bind_factorization to decide whether the
   numeric factorization should run on the host or the device, and installs
   the symbolic-factorization callbacks matching the requested factor type
   (LU/ILU/ILUDT or Cholesky/ICC). Errors for any other factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  /* factortype must be set before MatSetType so spptr is created as TriFactors */
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* query where the user wants the factorization to run (host or device) */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      /* GPU-capable symbolic callbacks (they may still fall back to host paths) */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2308 
/* Copy the matrix values from the device back to the host CSR array a->a.

   Only acts when the device copy is the sole current one (offloadmask ==
   PETSC_OFFLOAD_GPU); the sparsity pattern is assumed unchanged, so only
   values are transferred. For unfactored matrices the values live in the
   CsrMatrix inside cusp->mat; for factored matrices (CUDA >= 11.3) they live
   in fs->csrVal. Note both `cusp` and `fs` alias A->spptr -- which
   interpretation is valid depends on A->factortype. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    /* host and device now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2338 
/* Read/write access to the host value array: sync values down from the
   device first, then hand out the host pointer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2346 
/* End read/write access: the host values may have changed, so mark the
   device copy as stale and invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* writes happened on the host side only */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2354 
/* Read-only access to the host value array: sync from the device, then
   return the host pointer; the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2362 
/* End read-only access: nothing was modified, so only clear the
   caller's pointer (offload mask stays as-is). */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2369 
/* Write-only access to the host value array: no device-to-host sync is
   needed since the existing values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2376 
/* End write-only access: host values were (re)written, so the device
   copy is now stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* only the host copy is valid now */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2384 
/* Return device pointers to the CSR arrays (row offsets i, column indices j,
   values a) of an unfactored MATSEQAIJCUSPARSE matrix, plus the memory type.

   The matrix is first synced to the GPU. Any of i/j/a/mtype may be NULL when
   the caller does not need it. Since the device stores indices as 32-bit ints
   (THRUSTINTARRAY32), requesting i or j errors out in 64-bit-index builds.

   Fix over previous revision: the 64-bit-index error messages read
   "does not supported"; corrected to "does not support". */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    /* 32-bit PetscInt matches the device's int storage, so the pointer can be handed out directly */
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2415 
/* Copy (or rebuild) the device representation of a MATSEQAIJ matrix.

   Two regimes:
   - If the nonzero pattern is unchanged (nonzerostate matches) and the
     storage format is CSR, only the values are re-uploaded into the existing
     device CsrMatrix.
   - Otherwise the whole device structure (mult struct, work vector, cached
     row offsets, transpose) is destroyed and rebuilt from the host CSR data,
     honoring compressed-row storage and the chosen CSR/ELL/HYB format
     (ELL/HYB only before CUDA 11.0).
   On success the offload mask is set to PETSC_OFFLOAD_BOTH, unless the host
   had no value array yet (structure-only upload), in which case it is left
   unchanged. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* thrust assign performs the host-to-device transfer of the values */
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose values are stale (pattern kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* pattern (or format) changed: tear down the old device structures entirely */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage keeps only rows with nonzeros; ridx maps back to global rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        /* no host values yet: upload structure only and do not claim OFFLOAD_BOTH below */
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: stage a temporary CSR on the device, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        /* tmp only contributes when compressed-row indices were uploaded */
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2565 
/* Thrust functor for zip-iterator tuples: accumulate element 0 into
   element 1, i.e. y += x. */
struct VecCUDAPlusEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2573 
/* Thrust functor for zip-iterator tuples: copy element 0 into element 1,
   i.e. y = x. */
struct VecCUDAEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2581 
/* Thrust functor for zip-iterator tuples: copy element 1 into element 0
   (the opposite direction of VecCUDAEquals), i.e. x = y. */
struct VecCUDAEqualsReverse {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2589 
/* Per-product scratch data attached to Mat_Product->data for cuSPARSE
   matrix-matrix operations; released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* presumably: C was given as plain MATSEQDENSE and must be converted back -- TODO confirm against the symbolic phase */
  PetscScalar   *Bt;       /* device buffer freed with cudaFree(); assumed to hold an explicit transpose of B -- confirm at its fill site */
  Mat            X;        /* dense work matrix receiving the first multiply's result in MATPRODUCT_PtAP/RARt (see MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count accumulated for this product, for logging */
  CsrMatrix     *Bcsr;     /* device CSR copy related to B; exact role set elsewhere -- confirm */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B (generic SpMat API) */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B; recreated when B's leading dimension changes */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C (or for X in PtAP/RARt) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers introduced by the CUDA 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* main device work buffer for SpMM/SpGEMM */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* descriptor carrying SpGEMM state across phases */
#endif
};
2614 
/* Destructor for the MatMatCusparse scratch data hanging off a Mat_Product:
   releases device buffers, cuSPARSE descriptors, the work matrix X, and
   finally the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt)); /* cudaFree(NULL) is a harmless no-op */
  delete mm->Bcsr;                 /* likewise, delete of NULL is safe */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* cuSPARSE descriptors must only be destroyed if they were created */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2638 
2639 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2640 
2641 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2642 {
2643   Mat_Product                  *product = C->product;
2644   Mat                           A, B;
2645   PetscInt                      m, n, blda, clda;
2646   PetscBool                     flg, biscuda;
2647   Mat_SeqAIJCUSPARSE           *cusp;
2648   cusparseStatus_t              stat;
2649   cusparseOperation_t           opA;
2650   const PetscScalar            *barray;
2651   PetscScalar                  *carray;
2652   MatMatCusparse               *mmdata;
2653   Mat_SeqAIJCUSPARSEMultStruct *mat;
2654   CsrMatrix                    *csrmat;
2655 
2656   PetscFunctionBegin;
2657   MatCheckProduct(C, 1);
2658   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2659   mmdata = (MatMatCusparse *)product->data;
2660   A      = product->A;
2661   B      = product->B;
2662   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2663   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2664   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2665      Instead of silently accepting the wrong answer, I prefer to raise the error */
2666   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2667   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2668   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2669   switch (product->type) {
2670   case MATPRODUCT_AB:
2671   case MATPRODUCT_PtAP:
2672     mat = cusp->mat;
2673     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2674     m   = A->rmap->n;
2675     n   = B->cmap->n;
2676     break;
2677   case MATPRODUCT_AtB:
2678     if (!A->form_explicit_transpose) {
2679       mat = cusp->mat;
2680       opA = CUSPARSE_OPERATION_TRANSPOSE;
2681     } else {
2682       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2683       mat = cusp->matTranspose;
2684       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2685     }
2686     m = A->cmap->n;
2687     n = B->cmap->n;
2688     break;
2689   case MATPRODUCT_ABt:
2690   case MATPRODUCT_RARt:
2691     mat = cusp->mat;
2692     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2693     m   = A->rmap->n;
2694     n   = B->rmap->n;
2695     break;
2696   default:
2697     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2698   }
2699   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2700   csrmat = (CsrMatrix *)mat->mat;
2701   /* if the user passed a CPU matrix, copy the data to the GPU */
2702   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2703   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2704   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2705 
2706   PetscCall(MatDenseGetLDA(B, &blda));
2707   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2708     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2709     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2710   } else {
2711     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2712     PetscCall(MatDenseGetLDA(C, &clda));
2713   }
2714 
2715   PetscCall(PetscLogGpuTimeBegin());
2716 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2717   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2718   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2719   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2720     size_t mmBufferSize;
2721     if (mmdata->initialized && mmdata->Blda != blda) {
2722       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2723       mmdata->matBDescr = NULL;
2724     }
2725     if (!mmdata->matBDescr) {
2726       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2727       mmdata->Blda = blda;
2728     }
2729 
2730     if (mmdata->initialized && mmdata->Clda != clda) {
2731       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2732       mmdata->matCDescr = NULL;
2733     }
2734     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2735       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2736       mmdata->Clda = clda;
2737     }
2738 
2739     if (!mat->matDescr) {
2740       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2741                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2742       PetscCallCUSPARSE(stat);
2743     }
2744     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2745     PetscCallCUSPARSE(stat);
2746     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2747       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2748       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2749       mmdata->mmBufferSize = mmBufferSize;
2750     }
2751     mmdata->initialized = PETSC_TRUE;
2752   } else {
2753     /* to be safe, always update pointers of the mats */
2754     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2755     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2756     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2757   }
2758 
2759   /* do cusparseSpMM, which supports transpose on B */
2760   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2761   PetscCallCUSPARSE(stat);
2762 #else
2763   PetscInt k;
2764   /* cusparseXcsrmm does not support transpose on B */
2765   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2766     cublasHandle_t cublasv2handle;
2767     cublasStatus_t cerr;
2768 
2769     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2770     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2771     PetscCallCUBLAS(cerr);
2772     blda = B->cmap->n;
2773     k    = B->cmap->n;
2774   } else {
2775     k = B->rmap->n;
2776   }
2777 
2778   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2779   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2780   PetscCallCUSPARSE(stat);
2781 #endif
2782   PetscCall(PetscLogGpuTimeEnd());
2783   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2784   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2785   if (product->type == MATPRODUCT_RARt) {
2786     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2787     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2788   } else if (product->type == MATPRODUCT_PtAP) {
2789     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2790     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2791   } else {
2792     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2793   }
2794   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2795   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2796   PetscFunctionReturn(PETSC_SUCCESS);
2797 }
2798 
/*
  Symbolic phase for products of a MATSEQAIJCUSPARSE matrix A with a dense matrix B.
  Sets the sizes and (CUDA dense) type of C, and attaches the MatMatCusparse product
  data that MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA will reuse.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            cm, cn;
  PetscBool           cisdensehost, isseqaijcusp;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isseqaijcusp));
  PetscCheck(isseqaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result for each supported product type */
  if (product->type == MATPRODUCT_AB) {
    cm = A->rmap->n;
    cn = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    cm = A->cmap->n;
    cn = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    cm = A->rmap->n;
    cn = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    cm = B->cmap->n;
    cn = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    cm = B->rmap->n;
    cn = B->rmap->n;
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, cm, cn, cm, cn));
  /* remember whether the user gave us a host-side MATSEQDENSE C: the numeric
     phase computes on the GPU and converts the result back at the end */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdensehost));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* attach the product data reused by the numeric phase */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdensehost;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* RARt and PtAP need the intermediate dense matrix X = A*R^t (resp. A*P) */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2869 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all
  MATSEQAIJCUSPARSE. The sparsity pattern of C and the cuSPARSE SpGEMM
  descriptors/buffers cached in C->product->data were built by the symbolic
  phase; here we only (re)compute the numerical values of C on the GPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: skip the GPU compute, only finish assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the GPU before computing */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* if the symbolic phase exploited symmetry (AtB with symmetric A, or ABt with
     symmetric B), the product was set up as a plain AB on non-transposed data */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the operand mult structs: AtB/ABt use the explicitly formed transposes
     (cuSPARSE spgemm has no transpose support, see opA/opB above) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero scalars live in GPU memory, so cuSPARSE must dereference device pointers */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* SpGEMMreuse path: pattern and work buffers were prepared in the symbolic phase,
     a single compute call updates the values of C in place */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older SpGEMM API: recompute with the cached buffers, then copy the result into C's arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 legacy csrgemm: writes directly into C's preallocated CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* the values of C are now up to date only on the GPU */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2990 
2991 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2992 {
2993   Mat_Product                  *product = C->product;
2994   Mat                           A, B;
2995   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2996   Mat_SeqAIJ                   *a, *b, *c;
2997   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2998   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2999   PetscInt                      i, j, m, n, k;
3000   PetscBool                     flg;
3001   cusparseStatus_t              stat;
3002   MatProductType                ptype;
3003   MatMatCusparse               *mmdata;
3004   PetscLogDouble                flops;
3005   PetscBool                     biscompressed, ciscompressed;
3006 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3007   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3008   cusparseSpMatDescr_t BmatSpDescr;
3009 #else
3010   int cnz;
3011 #endif
3012   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3013 
3014   PetscFunctionBegin;
3015   MatCheckProduct(C, 1);
3016   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3017   A = product->A;
3018   B = product->B;
3019   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3020   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3021   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3022   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3023   a = (Mat_SeqAIJ *)A->data;
3024   b = (Mat_SeqAIJ *)B->data;
3025   /* product data */
3026   PetscCall(PetscNew(&mmdata));
3027   C->product->data    = mmdata;
3028   C->product->destroy = MatDestroy_MatMatCusparse;
3029 
3030   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3031   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3032   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3033   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3034   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3035   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036 
3037   ptype = product->type;
3038   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3039     ptype                                          = MATPRODUCT_AB;
3040     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3041   }
3042   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3043     ptype                                          = MATPRODUCT_AB;
3044     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3045   }
3046   biscompressed = PETSC_FALSE;
3047   ciscompressed = PETSC_FALSE;
3048   switch (ptype) {
3049   case MATPRODUCT_AB:
3050     m    = A->rmap->n;
3051     n    = B->cmap->n;
3052     k    = A->cmap->n;
3053     Amat = Acusp->mat;
3054     Bmat = Bcusp->mat;
3055     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3056     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3057     break;
3058   case MATPRODUCT_AtB:
3059     m = A->cmap->n;
3060     n = B->cmap->n;
3061     k = A->rmap->n;
3062     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3063     Amat = Acusp->matTranspose;
3064     Bmat = Bcusp->mat;
3065     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3066     break;
3067   case MATPRODUCT_ABt:
3068     m = A->rmap->n;
3069     n = B->rmap->n;
3070     k = A->cmap->n;
3071     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3072     Amat = Acusp->mat;
3073     Bmat = Bcusp->matTranspose;
3074     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3075     break;
3076   default:
3077     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3078   }
3079 
3080   /* create cusparse matrix */
3081   PetscCall(MatSetSizes(C, m, n, m, n));
3082   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3083   c     = (Mat_SeqAIJ *)C->data;
3084   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3085   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3086   Ccsr  = new CsrMatrix;
3087 
3088   c->compressedrow.use = ciscompressed;
3089   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3090     c->compressedrow.nrows = a->compressedrow.nrows;
3091     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3092     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3093     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3094     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3095     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3096   } else {
3097     c->compressedrow.nrows  = 0;
3098     c->compressedrow.i      = NULL;
3099     c->compressedrow.rindex = NULL;
3100     Ccusp->workVector       = NULL;
3101     Cmat->cprowIndices      = NULL;
3102   }
3103   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3104   Ccusp->mat        = Cmat;
3105   Ccusp->mat->mat   = Ccsr;
3106   Ccsr->num_rows    = Ccusp->nrows;
3107   Ccsr->num_cols    = n;
3108   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3109   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3110   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3111   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3112   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
3113   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
3114   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3115   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3116   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3117   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3119     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3120     c->nz                = 0;
3121     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3122     Ccsr->values         = new THRUSTARRAY(c->nz);
3123     goto finalizesym;
3124   }
3125 
3126   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3127   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3128   Acsr = (CsrMatrix *)Amat->mat;
3129   if (!biscompressed) {
3130     Bcsr = (CsrMatrix *)Bmat->mat;
3131 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3132     BmatSpDescr = Bmat->matDescr;
3133 #endif
3134   } else { /* we need to use row offsets for the full matrix */
3135     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3136     Bcsr                 = new CsrMatrix;
3137     Bcsr->num_rows       = B->rmap->n;
3138     Bcsr->num_cols       = cBcsr->num_cols;
3139     Bcsr->num_entries    = cBcsr->num_entries;
3140     Bcsr->column_indices = cBcsr->column_indices;
3141     Bcsr->values         = cBcsr->values;
3142     if (!Bcusp->rowoffsets_gpu) {
3143       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3144       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3145       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3146     }
3147     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3148     mmdata->Bcsr      = Bcsr;
3149 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3150     if (Bcsr->num_rows && Bcsr->num_cols) {
3151       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3152       PetscCallCUSPARSE(stat);
3153     }
3154     BmatSpDescr = mmdata->matSpBDescr;
3155 #endif
3156   }
3157   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3158   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3159   /* precompute flops count */
3160   if (ptype == MATPRODUCT_AB) {
3161     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3162       const PetscInt st = a->i[i];
3163       const PetscInt en = a->i[i + 1];
3164       for (j = st; j < en; j++) {
3165         const PetscInt brow = a->j[j];
3166         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3167       }
3168     }
3169   } else if (ptype == MATPRODUCT_AtB) {
3170     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3171       const PetscInt anzi = a->i[i + 1] - a->i[i];
3172       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3173       flops += (2. * anzi) * bnzi;
3174     }
3175   } else { /* TODO */
3176     flops = 0.;
3177   }
3178 
3179   mmdata->flops = flops;
3180   PetscCall(PetscLogGpuTimeBegin());
3181 
3182 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3183   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3184   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3185   PetscCallCUSPARSE(stat);
3186   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3187   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3188   {
3189     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3190      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3191   */
3192     void *dBuffer1 = NULL;
3193     void *dBuffer2 = NULL;
3194     void *dBuffer3 = NULL;
3195     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3196     size_t bufferSize1 = 0;
3197     size_t bufferSize2 = 0;
3198     size_t bufferSize3 = 0;
3199     size_t bufferSize4 = 0;
3200     size_t bufferSize5 = 0;
3201 
3202     /* ask bufferSize1 bytes for external memory */
3203     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3204     PetscCallCUSPARSE(stat);
3205     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3206     /* inspect the matrices A and B to understand the memory requirement for the next step */
3207     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3208     PetscCallCUSPARSE(stat);
3209 
3210     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3211     PetscCallCUSPARSE(stat);
3212     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3213     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3214     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3215     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3216     PetscCallCUSPARSE(stat);
3217     PetscCallCUDA(cudaFree(dBuffer1));
3218     PetscCallCUDA(cudaFree(dBuffer2));
3219 
3220     /* get matrix C non-zero entries C_nnz1 */
3221     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3222     c->nz = (PetscInt)C_nnz1;
3223     /* allocate matrix C */
3224     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3225     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3226     Ccsr->values = new THRUSTARRAY(c->nz);
3227     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3228     /* update matC with the new pointers */
3229     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3230     PetscCallCUSPARSE(stat);
3231 
3232     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3233     PetscCallCUSPARSE(stat);
3234     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3235     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3236     PetscCallCUSPARSE(stat);
3237     PetscCallCUDA(cudaFree(dBuffer3));
3238     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3239     PetscCallCUSPARSE(stat);
3240     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3241   }
3242   #else
3243   size_t bufSize2;
3244   /* ask bufferSize bytes for external memory */
3245   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3246   PetscCallCUSPARSE(stat);
3247   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3248   /* inspect the matrices A and B to understand the memory requirement for the next step */
3249   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3250   PetscCallCUSPARSE(stat);
3251   /* ask bufferSize again bytes for external memory */
3252   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3253   PetscCallCUSPARSE(stat);
3254   /* The CUSPARSE documentation is not clear, nor the API
3255      We need both buffers to perform the operations properly!
3256      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3257      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3258      is stored in the descriptor! What a messy API... */
3259   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3260   /* compute the intermediate product of A * B */
3261   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3262   PetscCallCUSPARSE(stat);
3263   /* get matrix C non-zero entries C_nnz1 */
3264   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3265   c->nz = (PetscInt)C_nnz1;
3266   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3267                       mmdata->mmBufferSize / 1024));
3268   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3269   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3270   Ccsr->values = new THRUSTARRAY(c->nz);
3271   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3272   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3273   PetscCallCUSPARSE(stat);
3274   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3275   PetscCallCUSPARSE(stat);
3276   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3277 #else
3278   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3279   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3280                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3281   PetscCallCUSPARSE(stat);
3282   c->nz                = cnz;
3283   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3284   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3285   Ccsr->values = new THRUSTARRAY(c->nz);
3286   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3287 
3288   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3289   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3290      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3291      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3292   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3293                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3294   PetscCallCUSPARSE(stat);
3295 #endif
3296   PetscCall(PetscLogGpuFlops(mmdata->flops));
3297   PetscCall(PetscLogGpuTimeEnd());
3298 finalizesym:
3299   c->singlemalloc = PETSC_FALSE;
3300   c->free_a       = PETSC_TRUE;
3301   c->free_ij      = PETSC_TRUE;
3302   PetscCall(PetscMalloc1(m + 1, &c->i));
3303   PetscCall(PetscMalloc1(c->nz, &c->j));
3304   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3305     PetscInt      *d_i = c->i;
3306     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3307     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3308     ii = *Ccsr->row_offsets;
3309     jj = *Ccsr->column_indices;
3310     if (ciscompressed) d_i = c->compressedrow.i;
3311     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3312     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3313   } else {
3314     PetscInt *d_i = c->i;
3315     if (ciscompressed) d_i = c->compressedrow.i;
3316     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3317     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3318   }
3319   if (ciscompressed) { /* need to expand host row offsets */
3320     PetscInt r = 0;
3321     c->i[0]    = 0;
3322     for (k = 0; k < c->compressedrow.nrows; k++) {
3323       const PetscInt next = c->compressedrow.rindex[k];
3324       const PetscInt old  = c->compressedrow.i[k];
3325       for (; r < next; r++) c->i[r + 1] = old;
3326     }
3327     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3328   }
3329   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3330   PetscCall(PetscMalloc1(m, &c->ilen));
3331   PetscCall(PetscMalloc1(m, &c->imax));
3332   c->maxnz         = c->nz;
3333   c->nonzerorowcnt = 0;
3334   c->rmax          = 0;
3335   for (k = 0; k < m; k++) {
3336     const PetscInt nn = c->i[k + 1] - c->i[k];
3337     c->ilen[k] = c->imax[k] = nn;
3338     c->nonzerorowcnt += (PetscInt) !!nn;
3339     c->rmax = PetscMax(c->rmax, nn);
3340   }
3341   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3342   PetscCall(PetscMalloc1(c->nz, &c->a));
3343   Ccsr->num_entries = c->nz;
3344 
3345   C->nonzerostate++;
3346   PetscCall(PetscLayoutSetUp(C->rmap));
3347   PetscCall(PetscLayoutSetUp(C->cmap));
3348   Ccusp->nonzerostate = C->nonzerostate;
3349   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3350   C->preallocated     = PETSC_TRUE;
3351   C->assembled        = PETSC_FALSE;
3352   C->was_assembled    = PETSC_FALSE;
3353   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3354     mmdata->reusesym = PETSC_TRUE;
3355     C->offloadmask   = PETSC_OFFLOAD_GPU;
3356   }
3357   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3358   PetscFunctionReturn(PETSC_SUCCESS);
3359 }
3360 
3361 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3362 
/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *api_option = NULL; /* option name used with the legacy user API (e.g. MatMatMult) */
    const char *api_func   = NULL; /* legacy API function name; also the man-page entry for the option */
    const char *prod_title = NULL; /* options-block title used with the MatProduct API */

    /* All product types share the same query pattern; only the strings differ */
    switch (product->type) {
    case MATPRODUCT_AB:
      api_option = "-matmatmult_backend_cpu";
      api_func   = "MatMatMult";
      prod_title = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      api_option = "-mattransposematmult_backend_cpu";
      api_func   = "MatTransposeMatMult";
      prod_title = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      api_option = "-matptap_backend_cpu";
      api_func   = "MatPtAP";
      prod_title = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      api_option = "-matrart_backend_cpu";
      api_func   = "MatRARt";
      prod_title = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      api_option = "-matmatmatmult_backend_cpu";
      api_func   = "MatMatMatMult";
      prod_title = "MatProduct_ABC";
      break;
    default:
      break;
    }
    if (api_option) {
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, api_func, "Mat");
        PetscCall(PetscOptionsBool(api_option, "Use CPU code", api_func, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, prod_title, "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", api_func, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      /* CPU-bound A falls back to the SeqAIJ-SeqDense path */
      if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composite products are realized by chaining basic ones */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3480 
/* yy = A*xx; thin wrapper over the shared mult/mult-add kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* trans = PETSC_FALSE, herm = PETSC_FALSE: plain product, no vector to add (yy slot is NULL) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3487 
/* zz = A*xx + yy; thin wrapper over the shared mult/mult-add kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* trans = PETSC_FALSE, herm = PETSC_FALSE: plain product with yy added into zz */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3494 
/* yy = A^H*xx; thin wrapper over the shared mult/mult-add kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_TRUE: conjugate-transpose product, no vector to add */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3501 
/* zz = A^H*xx + yy; thin wrapper over the shared mult/mult-add kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_TRUE: conjugate-transpose product with yy added into zz */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3508 
/* yy = A^T*xx; thin wrapper over the shared mult/mult-add kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE: transpose product, no vector to add */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3515 
/* y[idx[i]] += x[i] for i in [0, n): scatter-add the packed vector x into y at positions idx.
   Used below to expand a compressed-row product into the full-length result vector.
   Launch: 1-D grid sized by the caller to cover n; each thread handles one entry, with a
   bounds guard for the ragged last block.
   NOTE(review): assumes idx contains no duplicate indices — concurrent "+=" on the same
   y entry would race; the compressed-row indices passed by the caller appear unique. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Compute the global index in PetscInt (casting before the multiply) so it cannot
     overflow 32-bit int when n exceeds 2^31 with 64-bit PetscInt; idx and x are const
     so the compiler may route the loads through the read-only cache. */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3521 
/* z = op(A) x + y, the shared kernel behind the MatMult/MatMultAdd/MatMultTranspose(Add)/
   MatMultHermitianTranspose(Add) wrappers above.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (nothing added) or alias zz (in-place add); herm without trans is rejected. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x, taken from the matrix (see below) */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: op(A) x == 0, so the result is just yy (when given) or zero */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose on the fly to the untransposed matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use the explicitly stored transpose (built on demand) with a non-transposed op */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: workVector[k] = x[cprowIndices[k]] */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* transposed operation: x has num_rows entries and y has num_cols entries */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes the per-operation cuSpMV[] cache below, so guard against enum changes */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply + add) per nonzero; without yy, one add per nonzero row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3692 
/* zz = A^T*xx + yy; thin wrapper over the shared mult/mult-add kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE: transpose product with yy added into zz */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3699 
/* Finish assembly on the host, then drop the cached device matrix if the nonzero
   structure changed during assembly (the cache would no longer match). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  const PetscObjectState prevNonzeroState = A->nonzerostate; /* snapshot before assembly */
  Mat_SeqAIJCUSPARSE    *cusparsestruct   = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (cusparsestruct->deviceMat && prevNonzeroState != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3714 
3715 /*@
3716    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3718    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3719    assembly performance the user should preallocate the matrix storage by setting
3720    the parameter `nz` (or the array `nnz`).
3721 
3722    Collective
3723 
3724    Input Parameters:
3725 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3726 .  m - number of rows
3727 .  n - number of columns
.  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3729 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3730 
3731    Output Parameter:
3732 .  A - the matrix
3733 
3734    Level: intermediate
3735 
3736    Notes:
3737    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3739    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3740 
3741    The AIJ format, also called
3742    compressed row storage, is fully compatible with standard Fortran
3743    storage.  That is, the stored row and column indices can begin at
3744    either one (as in Fortran) or zero.
3745 
3746    Specify the preallocated storage with either nz or nnz (not both).
3747    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3748    allocation.
3749 
3750 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3751 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* const is cast away to match the internal preallocation signature;
     NOTE(review): assumed not to modify nnz — verify against MatSeqAIJSetPreallocation_SeqAIJ */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3761 
/* Destroy the CUSPARSE-specific data (plain or factored), detach every composed
   method this type registers, then fall through to the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of all functions composed onto the object by this matrix type */
  static const char *const composedOps[] = {"MatSeqAIJCopySubArray_C",
                                            "MatCUSPARSESetFormat_C",
                                            "MatCUSPARSESetUseCPUSolve_C",
                                            "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                            "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                            "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                            "MatFactorGetSolverType_C",
                                            "MatSetPreallocationCOO_C",
                                            "MatSetValuesCOO_C",
                                            "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* the GPU payload differs depending on whether this is a factored matrix */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  /* composing NULL removes each previously registered method */
  for (size_t i = 0; i < sizeof(composedOps) / sizeof(composedOps[0]); ++i) PetscCall(PetscObjectComposeFunction((PetscObject)A, composedOps[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3783 
3784 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3785 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by copying on the host as SeqAIJ, then converting the copy in place back to CUSPARSE */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* in-place conversion re-attaches the CUSPARSE type and GPU data structures to *B */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3793 
/*
  Computes Y = Y + a*X on the GPU when both matrices are SEQAIJCUSPARSE and unbound from the CPU.

  Dispatch:
  - If the two matrices do not share the same axpy implementation (one is bound to CPU),
    fall back to the host MatAXPY_SeqAIJ and invalidate Y's cached transpose.
  - SAME_NONZERO_PATTERN: a single cublas axpy over the nonzero value arrays.
  - SUBSET_NONZERO_PATTERN: cusparse csr geam with beta = 1, accumulating into Y in place.
  - Anything else: host fallback (may change Y's nonzero pattern).
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one of the matrices is bound to CPU: do the operation on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/col index arrays on device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via cusparse geam; output aliases Y's arrays so the pattern must contain X's */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* alpha/beta are passed by host pointer here, so switch the pointer mode around the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit work buffer (csrgeam2-style API) */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-by-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* general case: host fallback; Y's pattern (and thus its cached transpose) may change */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3879 
/* Scales all stored nonzeros of Y by a, operating directly on the device value array via cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *varray;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &varray));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, varray, stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &varray));
  /* scaling changes the values, so any cached factored diagonal is stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3899 
/* Zeroes all stored values of A on both host and (when present) device, keeping the two in sync */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *a             = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroed_on_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;

      if (csr->values) {
        zeroed_on_gpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    /* keep the cached transpose consistent too */
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)spptr->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* always zero the host-side copy */
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* if the device values were also zeroed, both copies are valid; otherwise only the CPU copy is */
  A->offloadmask = zeroed_on_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3926 
/*
  Binds (flg = PETSC_TRUE) or unbinds (flg = PETSC_FALSE) A to the CPU by swapping its
  operation table between the host SeqAIJ kernels and the CUSPARSE kernels, and by
  composing/uncomposing the COO and MatProduct plugin functions accordingly.
  Factored matrices only record the flag; their op tables are managed elsewhere.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure host values are current before routing operations to the CPU */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ array-access hooks (getarray etc.) so default host paths are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* route raw array access through the CUSPARSE-aware accessors */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the host kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3990 
/*
  Converts a SeqAIJ matrix to SEQAIJCUSPARSE. Supports MAT_INITIAL_MATRIX (duplicate),
  MAT_REUSE_MATRIX (copy into *newmat) and MAT_INPLACE_MATRIX (retype A itself).
  Allocates the CUSPARSE side data (handle, default algorithms) on first conversion,
  installs the CUSPARSE op table via MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE),
  and composes the type-specific plugin functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created by this matrix should live on the GPU by default */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry a lighter-weight triangular-factors struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* start unbound: install the GPU op table */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4050 
/* Type constructor: build a SeqAIJ matrix and convert it in place to SEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4058 
4059 /*MC
4060    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4061 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4063    CSR, ELL, or Hybrid format.
4064    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4065 
4066    Options Database Keys:
4067 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4068 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4069                                       Other options include ell (ellpack) or hyb (hybrid).
4070 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4071 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4072 
4073   Level: beginner
4074 
4075 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4076 M*/
4077 
4078 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
4079 
/* Registers the CUSPARSE (LU/Cholesky/ILU/ICC) and CUSPARSEBAND (LU) solver types with PETSc */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  /* the generic CUSPARSE solver supports all four factorization types with the same getter */
  for (size_t i = 0; i < sizeof(ftypes) / sizeof(ftypes[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4091 
/*
  Releases all COO assembly state held by the CUSPARSE side data: the basic-path
  permutation arrays (cooPerm/cooPerm_a) and the extended-path device maps (jmap_d/perm_d).
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the pointers: MatSeqAIJCUSPARSE_Destroy() frees any non-NULL jmap_d/perm_d,
       so leaving them dangling here would cause a double cudaFree at destruction */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4109 
/*
  Frees the entire Mat_SeqAIJCUSPARSE side-data structure: both mult structs
  (matrix and cached transpose), thrust work arrays, COO state, the cusparse
  handle, and the struct itself. Safe to call with *cusparsestruct == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4128 
/* Frees a CsrMatrix (its three device arrays and the struct) and NULLs the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4141 
#if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0)
/*
  Frees one triangular-factor struct (descriptor, csrsv solve info, CSR storage,
  solve/conversion buffers, pinned host copy). Only used by the legacy (< CUDA 11.3)
  csrsv-based solve path. Safe to call with *trifactor == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4160 
/*
  Frees a mult struct: the stored matrix (CSR, or HYB/ELL on pre-11.0 CUDA),
  its cusparse descriptors, compressed-row indices, device-resident scalar
  constants, and (CUDA >= 11) the SpMV descriptors/buffers for the three
  mult variants. Safe to call with *matstruct == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used with device pointer mode */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup per operation flavor (see cuSpMV[3] in the struct) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4202 
/*
  Releases all factorization data held by a triangular-factors struct without
  freeing the struct or its cusparse handle, so it can be refilled by a new
  factorization. Handles both the legacy (< CUDA 11.3) csrsv path and the
  modern (>= CUDA 11.3) SpSV path. Safe to call with *trifactors == NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0)
    /* legacy path: four separate triangular-factor structs plus a work vector */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    /* banded-factorization (MATSOLVERCUSPARSEBAND) buffers */
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    /* modern path: one combined factor in CSR form plus SpSV descriptors and buffers */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4256 
/* Fully destroys a triangular-factors struct: reset its contents, destroy the cusparse handle, free the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = *trifactors;

  PetscFunctionBegin;
  if (tf) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy(tf->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4267 
/* Strict-weak ordering on (i,j) pairs: lexicographic by row, then by column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
4276 
/* Equality of (i,j) pairs: both coordinates must match */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
4284 
/* 1 when consecutive values differ, 0 when equal (used with adjacent_difference to flag new entries) */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 != t2 ? 1 : 0; }
};
4288 
/* Logical OR of two flags, as a PetscInt (0 or 1) */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
4292 
4293 #include <thrust/iterator/discard_iterator.h>
4294 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic():
   scatters/reduces the user-provided COO values v[] (host or device memory) into the
   device CSR value array, using the permutation (cooPerm) and, when the COO input had
   repeated (i,j) entries, the compression map (cooPerm_a) built at preallocation time.
   v == NULL with INSERT_VALUES zeroes the matrix. Finishes with a shortened
   MatAssemblyEnd_SeqAIJ so the matrix is marked assembled. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: nothing to scatter, just (re)assemble */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  /* get a device view of v[]: use it directly if already on device, otherwise copy */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries with INSERT_VALUES: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4371 
/* Marks A's cached device transpose as stale; with destroy == PETSC_TRUE the
   transpose mult struct and the csr2csc index map are freed outright */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusparsestruct) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose, cusparsestruct->format));
      delete cusparsestruct->csr2csc_i;
      cusparsestruct->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4387 
4388 #include <thrust/binary_search.h>
4389 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4390 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4391 {
4392   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4393   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
4394   PetscInt            cooPerm_n, nzr = 0;
4395 
4396   PetscFunctionBegin;
4397   PetscCall(PetscLayoutSetUp(A->rmap));
4398   PetscCall(PetscLayoutSetUp(A->cmap));
4399   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4400   if (n != cooPerm_n) {
4401     delete cusp->cooPerm;
4402     delete cusp->cooPerm_a;
4403     cusp->cooPerm   = NULL;
4404     cusp->cooPerm_a = NULL;
4405   }
4406   if (n) {
4407     thrust::device_ptr<PetscInt> d_i, d_j;
4408     PetscInt                    *d_raw_i, *d_raw_j;
4409     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4410     PetscMemType                 imtype, jmtype;
4411 
4412     PetscCall(PetscGetMemType(coo_i, &imtype));
4413     if (PetscMemTypeHost(imtype)) {
4414       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4415       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4416       d_i        = thrust::device_pointer_cast(d_raw_i);
4417       free_raw_i = PETSC_TRUE;
4418       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4419     } else {
4420       d_i = thrust::device_pointer_cast(coo_i);
4421     }
4422 
4423     PetscCall(PetscGetMemType(coo_j, &jmtype));
4424     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4425       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4426       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4427       d_j        = thrust::device_pointer_cast(d_raw_j);
4428       free_raw_j = PETSC_TRUE;
4429       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4430     } else {
4431       d_j = thrust::device_pointer_cast(coo_j);
4432     }
4433 
4434     THRUSTINTARRAY ii(A->rmap->n);
4435 
4436     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4437     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
4438 
4439     /* Ex.
4440       n = 6
4441       coo_i = [3,3,1,4,1,4]
4442       coo_j = [3,2,2,5,2,6]
4443     */
4444     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4445     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
4446 
4447     PetscCall(PetscLogGpuTimeBegin());
4448     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4449     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4450     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4451     THRUSTINTARRAY w(d_j, d_j + n);
4452 
4453     /*
4454       d_i     = [1,1,3,3,4,4]
4455       d_j     = [2,2,2,3,5,6]
4456       cooPerm = [2,4,1,0,3,5]
4457     */
4458     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4459 
4460     /*
4461       d_i     = [1,3,3,4,4,x]
4462                             ^ekey
4463       d_j     = [2,2,3,5,6,x]
4464                            ^nekey
4465     */
4466     if (nekey == ekey) { /* all entries are unique */
4467       delete cusp->cooPerm_a;
4468       cusp->cooPerm_a = NULL;
4469     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4470       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4471       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4472       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4473       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4474       w[0]                  = 0;
4475       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4476       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4477     }
4478     thrust::counting_iterator<PetscInt> search_begin(0);
4479     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4480                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4481                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4482     PetscCall(PetscLogGpuTimeEnd());
4483 
4484     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
4485     a->singlemalloc = PETSC_FALSE;
4486     a->free_a       = PETSC_TRUE;
4487     a->free_ij      = PETSC_TRUE;
4488     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4489     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4490     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4491     a->nz = a->maxnz = a->i[A->rmap->n];
4492     a->rmax          = 0;
4493     PetscCall(PetscMalloc1(a->nz, &a->a));
4494     PetscCall(PetscMalloc1(a->nz, &a->j));
4495     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4496     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
4497     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
4498     for (PetscInt i = 0; i < A->rmap->n; i++) {
4499       const PetscInt nnzr = a->i[i + 1] - a->i[i];
4500       nzr += (PetscInt) !!(nnzr);
4501       a->ilen[i] = a->imax[i] = nnzr;
4502       a->rmax                 = PetscMax(a->rmax, nnzr);
4503     }
4504     a->nonzerorowcnt = nzr;
4505     A->preallocated  = PETSC_TRUE;
4506     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
4507     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4508     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4509     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
4510   } else {
4511     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
4512   }
4513   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
4514 
4515   /* We want to allocate the CUSPARSE struct for matvec now.
4516      The code is so convoluted now that I prefer to copy zeros */
4517   PetscCall(PetscArrayzero(a->a, a->nz));
4518   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
4519   A->offloadmask = PETSC_OFFLOAD_CPU;
4520   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4521   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
4522   PetscFunctionReturn(PETSC_SUCCESS);
4523 }
4524 
/* Preallocate a SeqAIJCUSPARSE matrix from COO triplets (coo_i[], coo_j[]).

   Two code paths: the "basic" device path, usable when the index arrays are on
   the device or contain no negative (i.e. ignored) entries, and the extended
   path, which builds jmap/perm on the host via the generic SeqAIJ code and then
   mirrors them on the device for use by MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *aseq;
  Mat_SeqAIJCUSPARSE *acusp;
  PetscBool           use_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* Drop any COO state left over from a previous preallocation */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* The basic path cannot handle negative (ignored) indices; scan for them when
     the index arrays live on host memory. Device-resident arrays are not scanned. */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n && use_basic; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) use_basic = PETSC_FALSE;
      }
    }
  }

  if (use_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* Extended path: host-side preallocation, then copy jmap/perm to the device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    aseq  = static_cast<Mat_SeqAIJ *>(mat->data);
    acusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&acusp->jmap_d, (aseq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(acusp->jmap_d, aseq->jmap, (aseq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&acusp->perm_d, aseq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(acusp->perm_d, aseq->perm, aseq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    acusp->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4563 
/* Device kernel: accumulate COO input values into the CSR value array.

   Grid-stride loop over the nnz unique CSR entries; entry i gathers all COO
   contributions kv[perm[k]] for k in [jmap[i], jmap[i+1]) and adds the sum to
   a[i] (or assigns it, for INSERT_VALUES). Any 1D grid/block launch is valid. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  /* widen to PetscCount BEFORE multiplying: blockIdx.x * blockDim.x would be
     evaluated in 32-bit unsigned arithmetic and can wrap for very large launches */
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4574 
/* Insert/add COO values v[] (host or device) into a SeqAIJCUSPARSE matrix that
   was preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  const PetscCount    nnz  = aij->nz;
  const PetscScalar  *vdev = v;
  PetscScalar        *carr;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (!cusp->use_extended_coo) { /* basic preallocation path has its own kernel */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* stage host-provided values on the device */
    PetscCallCUDA(cudaMalloc((void **)&vdev, aij->coo_n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)vdev, v, aij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* write access zeroes-and-fills; read-write access accumulates */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &carr));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &carr));

  if (nnz) {
    MatAddCOOValues<<<(nnz + 255) / 256, 256>>>(vdev, nnz, cusp->jmap_d, cusp->perm_d, imode, carr);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing state */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &carr));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &carr));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)vdev)); /* free the staging buffer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4609 
4610 /*@C
4611     MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4612 
4613    Not Collective
4614 
4615     Input Parameters:
4616 +   A - the matrix
4617 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4618 
4619     Output Parameters:
4620 +   i - the CSR row pointers
4621 -   j - the CSR column indices
4622 
4623     Level: developer
4624 
4625     Note:
4626       When compressed is true, the CSR structure does not contain empty rows
4627 
4628 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4629 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Bail out only when NOTHING was requested. The previous test (!i || !j)
     returned early unless BOTH pointers were supplied, making the per-pointer
     guards below dead code and silently ignoring single-array requests. */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily mirror the uncompressed host row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4657 
4658 /*@C
4659     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4660 
4661    Not Collective
4662 
4663     Input Parameters:
4664 +   A - the matrix
4665 .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4666 .   i - the CSR row pointers
4667 -   j - the CSR column indices
4668 
4669     Level: developer
4670 
4671 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4672 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: nothing format-dependent needs undoing */
  /* invalidate the borrowed device pointers so callers cannot keep using them */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4683 
4684 /*@C
4685    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4686 
4687    Not Collective
4688 
4689    Input Parameter:
4690 .   A - a `MATSEQAIJCUSPARSE` matrix
4691 
4692    Output Parameter:
4693 .   a - pointer to the device data
4694 
4695    Level: developer
4696 
4697    Note:
4698    May trigger host-device copies if up-to-date matrix data is on host
4699 
4700 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4701 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  /* validate inputs before touching any device state */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may copy host -> device if host data is newer */
  PetscCheck(sp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)sp->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* hand out the raw device pointer to the CSR values; read-only by contract */
  *a = csrmat->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4719 
4720 /*@C
4721    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4722 
4723    Not Collective
4724 
4725    Input Parameters:
4726 +   A - a `MATSEQAIJCUSPARSE` matrix
4727 -   a - pointer to the device data
4728 
4729    Level: developer
4730 
4731 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4732 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read access changes no matrix state; just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4742 
4743 /*@C
4744    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4745 
4746    Not Collective
4747 
4748    Input Parameter:
4749 .   A - a `MATSEQAIJCUSPARSE` matrix
4750 
4751    Output Parameter:
4752 .   a - pointer to the device data
4753 
4754    Level: developer
4755 
4756    Note:
4757    May trigger host-device copies if up-to-date matrix data is on host
4758 
4759 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4760 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: device values must be current first */
  PetscCheck(sp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)sp->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller may write through the pointer: device copy becomes authoritative
     and any cached (numeric) transpose is no longer valid */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4780 /*@C
4781    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4782 
4783    Not Collective
4784 
4785    Input Parameters:
4786 +   A - a `MATSEQAIJCUSPARSE` matrix
4787 -   a - pointer to the device data
4788 
4789    Level: developer
4790 
4791 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4792 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the borrowed device pointer must not be used after restore */
  /* values may have changed: drop cached diagonal info and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4804 
4805 /*@C
4806    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4807 
4808    Not Collective
4809 
4810    Input Parameter:
4811 .   A - a `MATSEQAIJCUSPARSE` matrix
4812 
4813    Output Parameter:
4814 .   a - pointer to the device data
4815 
4816    Level: developer
4817 
4818    Note:
4819    Does not trigger host-device copies and flags data validity on the GPU
4820 
4821 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4822 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no host->device copy, existing values will be overwritten */
  PetscCheck(sp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)sp->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device now holds the valid data */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4841 
4842 /*@C
4843    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4844 
4845    Not Collective
4846 
4847    Input Parameters:
4848 +   A - a `MATSEQAIJCUSPARSE` matrix
4849 -   a - pointer to the device data
4850 
4851    Level: developer
4852 
4853 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4854 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the borrowed device pointer must not be used after restore */
  /* values were (re)written: drop cached diagonal info and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4866 
/* Strict weak ordering on (row, col) for 4-tuples (row, col, value, tag):
   lexicographic by the first two components; value and tag do not participate. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    return r1 < r2 || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
4875 
/* Unary functor adding a fixed offset to an int index; used below (e.g. via
   transform iterators) to shift one matrix's column indices/row offsets. */
struct Shift {
  int offset_; // amount added by operator()

  Shift(int shift) : offset_(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + offset_; }
};
4882 
4883 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4884 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4885 {
4886   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4887   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4888   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4889   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4890   PetscInt                      Annz, Bnnz;
4891   cusparseStatus_t              stat;
4892   PetscInt                      i, m, n, zero = 0;
4893 
4894   PetscFunctionBegin;
4895   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4896   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4897   PetscValidPointer(C, 4);
4898   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4899   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4900   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4901   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4902   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4903   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4904   if (reuse == MAT_INITIAL_MATRIX) {
4905     m = A->rmap->n;
4906     n = A->cmap->n + B->cmap->n;
4907     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4908     PetscCall(MatSetSizes(*C, m, n, m, n));
4909     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4910     c                       = (Mat_SeqAIJ *)(*C)->data;
4911     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4912     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4913     Ccsr                    = new CsrMatrix;
4914     Cmat->cprowIndices      = NULL;
4915     c->compressedrow.use    = PETSC_FALSE;
4916     c->compressedrow.nrows  = 0;
4917     c->compressedrow.i      = NULL;
4918     c->compressedrow.rindex = NULL;
4919     Ccusp->workVector       = NULL;
4920     Ccusp->nrows            = m;
4921     Ccusp->mat              = Cmat;
4922     Ccusp->mat->mat         = Ccsr;
4923     Ccsr->num_rows          = m;
4924     Ccsr->num_cols          = n;
4925     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4926     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4927     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4928     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4929     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4930     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4931     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4932     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4933     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4934     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4935     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4936     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4937     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4938 
4939     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4940     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4941     Annz                 = (PetscInt)Acsr->column_indices->size();
4942     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4943     c->nz                = Annz + Bnnz;
4944     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4945     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4946     Ccsr->values         = new THRUSTARRAY(c->nz);
4947     Ccsr->num_entries    = c->nz;
4948     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4949     if (c->nz) {
4950       auto              Acoo = new THRUSTINTARRAY32(Annz);
4951       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4952       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4953       THRUSTINTARRAY32 *Aroff, *Broff;
4954 
4955       if (a->compressedrow.use) { /* need full row offset */
4956         if (!Acusp->rowoffsets_gpu) {
4957           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4958           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4959           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4960         }
4961         Aroff = Acusp->rowoffsets_gpu;
4962       } else Aroff = Acsr->row_offsets;
4963       if (b->compressedrow.use) { /* need full row offset */
4964         if (!Bcusp->rowoffsets_gpu) {
4965           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4966           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4967           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4968         }
4969         Broff = Bcusp->rowoffsets_gpu;
4970       } else Broff = Bcsr->row_offsets;
4971       PetscCall(PetscLogGpuTimeBegin());
4972       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4973       PetscCallCUSPARSE(stat);
4974       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4975       PetscCallCUSPARSE(stat);
4976       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4977       auto Aperm = thrust::make_constant_iterator(1);
4978       auto Bperm = thrust::make_constant_iterator(0);
4979 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4980       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4981       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4982 #else
4983       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4984       auto Bcib = Bcsr->column_indices->begin();
4985       auto Bcie = Bcsr->column_indices->end();
4986       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4987 #endif
4988       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4989       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4990       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4991       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4992       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4993       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4994       auto p1    = Ccusp->cooPerm->begin();
4995       auto p2    = Ccusp->cooPerm->begin();
4996       thrust::advance(p2, Annz);
4997       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4998 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4999       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
5000 #endif
5001       auto cci = thrust::make_counting_iterator(zero);
5002       auto cce = thrust::make_counting_iterator(c->nz);
5003 #if 0 //Errors on SUMMIT cuda 11.1.0
5004       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
5005 #else
5006       auto pred = thrust::identity<int>();
5007       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
5008       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
5009 #endif
5010       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
5011       PetscCallCUSPARSE(stat);
5012       PetscCall(PetscLogGpuTimeEnd());
5013       delete wPerm;
5014       delete Acoo;
5015       delete Bcoo;
5016       delete Ccoo;
5017 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5018       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5019       PetscCallCUSPARSE(stat);
5020 #endif
5021       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
5022         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
5023         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5024         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5025         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5026         CsrMatrix                    *CcsrT = new CsrMatrix;
5027         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5028         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5029 
5030         (*C)->form_explicit_transpose = PETSC_TRUE;
5031         (*C)->transupdated            = PETSC_TRUE;
5032         Ccusp->rowoffsets_gpu         = NULL;
5033         CmatT->cprowIndices           = NULL;
5034         CmatT->mat                    = CcsrT;
5035         CcsrT->num_rows               = n;
5036         CcsrT->num_cols               = m;
5037         CcsrT->num_entries            = c->nz;
5038 
5039         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
5040         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5041         CcsrT->values         = new THRUSTARRAY(c->nz);
5042 
5043         PetscCall(PetscLogGpuTimeBegin());
5044         auto rT = CcsrT->row_offsets->begin();
5045         if (AT) {
5046           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
5047           thrust::advance(rT, -1);
5048         }
5049         if (BT) {
5050           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
5051           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
5052           thrust::copy(titb, tite, rT);
5053         }
5054         auto cT = CcsrT->column_indices->begin();
5055         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
5056         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
5057         auto vT = CcsrT->values->begin();
5058         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5059         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5060         PetscCall(PetscLogGpuTimeEnd());
5061 
5062         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
5063         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
5064         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5065         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
5066         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
5067         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
5068         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5069         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5070         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5071 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5072         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5073         PetscCallCUSPARSE(stat);
5074 #endif
5075         Ccusp->matTranspose = CmatT;
5076       }
5077     }
5078 
5079     c->singlemalloc = PETSC_FALSE;
5080     c->free_a       = PETSC_TRUE;
5081     c->free_ij      = PETSC_TRUE;
5082     PetscCall(PetscMalloc1(m + 1, &c->i));
5083     PetscCall(PetscMalloc1(c->nz, &c->j));
5084     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
5085       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5086       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5087       ii = *Ccsr->row_offsets;
5088       jj = *Ccsr->column_indices;
5089       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5090       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5091     } else {
5092       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5093       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5094     }
5095     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
5096     PetscCall(PetscMalloc1(m, &c->ilen));
5097     PetscCall(PetscMalloc1(m, &c->imax));
5098     c->maxnz         = c->nz;
5099     c->nonzerorowcnt = 0;
5100     c->rmax          = 0;
5101     for (i = 0; i < m; i++) {
5102       const PetscInt nn = c->i[i + 1] - c->i[i];
5103       c->ilen[i] = c->imax[i] = nn;
5104       c->nonzerorowcnt += (PetscInt) !!nn;
5105       c->rmax = PetscMax(c->rmax, nn);
5106     }
5107     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
5108     PetscCall(PetscMalloc1(c->nz, &c->a));
5109     (*C)->nonzerostate++;
5110     PetscCall(PetscLayoutSetUp((*C)->rmap));
5111     PetscCall(PetscLayoutSetUp((*C)->cmap));
5112     Ccusp->nonzerostate = (*C)->nonzerostate;
5113     (*C)->preallocated  = PETSC_TRUE;
5114   } else {
5115     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
5116     c = (Mat_SeqAIJ *)(*C)->data;
5117     if (c->nz) {
5118       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
5119       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
5120       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
5121       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
5122       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5123       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5124       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5125       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5126       Acsr = (CsrMatrix *)Acusp->mat->mat;
5127       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
5128       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
5129       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
5130       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
5131       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
5132       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
5133       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
5134       auto pmid = Ccusp->cooPerm->begin();
5135       thrust::advance(pmid, Acsr->num_entries);
5136       PetscCall(PetscLogGpuTimeBegin());
5137       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
5138       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5139       thrust::for_each(zibait, zieait, VecCUDAEquals());
5140       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5141       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
5142       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
5143       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
5144       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5145         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5146         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5147         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5148         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5149         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5150         auto       vT    = CcsrT->values->begin();
5151         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5152         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5153         (*C)->transupdated = PETSC_TRUE;
5154       }
5155       PetscCall(PetscLogGpuTimeEnd());
5156     }
5157   }
5158   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5159   (*C)->assembled     = PETSC_TRUE;
5160   (*C)->was_assembled = PETSC_FALSE;
5161   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5162   PetscFunctionReturn(PETSC_SUCCESS);
5163 }
5164 
/* Gather selected entries of the matrix values array into v[]:
     v[k] = a[idx[k]] for k = 0,...,n-1 when idx is non-NULL (gather performed on the GPU),
     otherwise the first n entries of the values array are copied verbatim.
   v may point to either host or device memory; the space is detected at runtime with isCudaMem(),
   and a device staging buffer is used for the gather when v is host memory. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n); /* upload the gather indices (host -> device) */
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* gather directly into the caller's device buffer */
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer; copied back to the host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = dav[widx[k]] via a zipped (source, destination) iterator pair */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    /* NOTE(review): if this cudaMemcpy fails, PetscCallCUDA returns early and w leaks; consistent
       with surrounding code style, but a scoped owner would avoid it */
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the payload moved device -> host, so log it as GPU-to-CPU traffic
     (previously mislogged with PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5200