xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 174dc0c8cee294b82b85e4dd3b331b29396264fc)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
// Start ignoring -Wdeprecated-declarations diagnostics; the matching END macro is not visible in this part of the file.
25 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
// Name table handed to PetscOptionsEnum() for the storage-format options below: the three format names,
// then the enum type name, the common enumerator prefix, and a terminating 0.
26 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30 
31   typedef enum {
32       CUSPARSE_MV_ALG_DEFAULT = 0,
33       CUSPARSE_COOMV_ALG      = 1,
34       CUSPARSE_CSRMV_ALG1     = 2,
35       CUSPARSE_CSRMV_ALG2     = 3
36   } cusparseSpMVAlg_t;
37 
38   typedef enum {
39       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45       CUSPARSE_SPMM_COO_ALG1    = 1,
46       CUSPARSE_SPMM_COO_ALG2    = 2,
47       CUSPARSE_SPMM_COO_ALG3    = 3,
48       CUSPARSE_SPMM_COO_ALG4    = 5,
49       CUSPARSE_SPMM_CSR_ALG1    = 4,
50       CUSPARSE_SPMM_CSR_ALG2    = 6,
51   } cusparseSpMMAlg_t;
52 
53   typedef enum {
54       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
55       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
56   } cusparseCsr2CscAlg_t;
57   */
// Name tables handed to PetscOptionsEnum(): the string at position k is the option value that selects integer enum value k.
// Each table ends with the enum type name, the common enumerator prefix, and a terminating 0.
58 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
// Ordered by numeric value, not by name: CSR_ALG1 (= 4) precedes COO_ALG4 (= 5), per the cusparseSpMMAlg_t listing quoted above.
59 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
// cusparseCsr2CscAlg_t starts at 1, so slot 0 is filled with a placeholder to keep position == enum value.
60 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /* cuSPARSE has no enumerator with value 0, so a placeholder is inserted */, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61 #endif
62 
// Forward declarations of file-local (static) routines so they can be referenced before their definitions.
// Factorization entry points.
63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
// The following solve routines and the tri-factor-struct destroy overload are declared only when building against CUDA older than 11.4.
67 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73 #endif
// Options handling and the matrix operations (AXPY, scale, and the MatMult family).
74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
84 
// Destroy routines (judging by their names; the definitions are outside this view).
// MatSeqAIJCUSPARSEMultStruct_Destroy here is a second overload, distinguished from the one above by its parameter types.
85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
88 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
89 
// MatSeqAIJCUSPARSEInvalidateTranspose is called from MatSetOption_SeqAIJCUSPARSE when the explicit transpose is switched off.
90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
92 
// Sub-array copy and COO preallocation/value-insertion routines (by name; definitions are outside this view).
93 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96 
97 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
98 {
99   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
100 
101   PetscFunctionBegin;
102   switch (op) {
103   case MAT_CUSPARSE_MULT:
104     cusparsestruct->format = format;
105     break;
106   case MAT_CUSPARSE_ALL:
107     cusparsestruct->format = format;
108     break;
109   default:
110     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
111   }
112   PetscFunctionReturn(PETSC_SUCCESS);
113 }
114 
115 /*@
116   MatCUSPARSESetFormat - Sets the GPU storage format used by a `MATSEQAIJCUSPARSE` matrix for a particular
117   operation. Only the `MatMult()` operation can use different GPU storage formats.
118 
119   Not Collective
120 
121   Input Parameters:
122 + A      - Matrix of type `MATSEQAIJCUSPARSE`
123 . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
124         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
125 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)
126 
127   Level: intermediate
128 
129 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
130 @*/
131 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
132 {
133   PetscFunctionBegin;
134   PetscValidHeaderSpecific(A, MAT_CLASSID, 1); // argument 1 (A) is validated against MAT_CLASSID
135   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); // dispatch to the method composed on A under this name; presumably a no-op when A's type provides none (confirm against PetscTryMethod docs)
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
140 {
141   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
142 
143   PetscFunctionBegin;
144   cusparsestruct->use_cpu_solve = use_cpu;
145   PetscFunctionReturn(PETSC_SUCCESS);
146 }
147 
148 /*@
149   MatCUSPARSESetUseCPUSolve - Selects whether `MatSolve()` is performed on the CPU.
150 
151   Input Parameters:
152 + A       - Matrix of type `MATSEQAIJCUSPARSE`
153 - use_cpu - flag requesting the built-in CPU `MatSolve()`
154 
155   Level: intermediate
156 
157   Note:
158   The cuSparse LU solver currently computes the factors with the built-in CPU method
159   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
160   Use this routine to specify whether the solve is done on the CPU or on the GPU (GPU is the default).
161 
162 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
163 @*/
164 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
165 {
166   PetscFunctionBegin;
167   PetscValidHeaderSpecific(A, MAT_CLASSID, 1); // argument 1 (A) is validated against MAT_CLASSID
168   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); // dispatch to the method composed on A under this name; presumably a no-op when A's type provides none (confirm against PetscTryMethod docs)
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
173 {
174   PetscFunctionBegin;
175   switch (op) {
176   case MAT_FORM_EXPLICIT_TRANSPOSE:
177     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
178     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
179     A->form_explicit_transpose = flg;
180     break;
181   default:
182     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
183     break;
184   }
185   PetscFunctionReturn(PETSC_SUCCESS);
186 }
187 
188 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
189 {
190   MatCUSPARSEStorageFormat format;
191   PetscBool                flg;
192   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
193 
194   PetscFunctionBegin;
195   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
196   if (A->factortype == MAT_FACTOR_NONE) {
197     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
198     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
199 
200     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
201     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
202     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
203     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
205     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
206     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
207   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
208     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
209   #else
210     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
211   #endif
212     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
213     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
214 
215     PetscCall(
216       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
217     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
218 #endif
219   }
220   PetscOptionsHeadEnd();
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_LU - Repacks the host LU factors of A into one regular CSR matrix M,
  mirrors M on the device and prepares the cuSPARSE SpSV solves with L and U. Compiled only for CUDA >= 11.4
  (see the enclosing #if).

  Input Parameter:
. A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

  Nothing is done unless A->offloadmask == PETSC_OFFLOAD_CPU. This routine does not modify A->offloadmask.

  First call (fs->csrRowPtr is still NULL): builds the row pointers and column indices of M on the host,
  allocates the device arrays, creates the L and U sparse-matrix descriptors, the work vectors and the SpSV
  descriptors/buffers, and keeps the host row pointers and value array in fs for later calls.

  Every call: refills the host value array from the factors, copies it to the device, and then either
  updates the existing SpSV analyses (CUDA >= 12.1.1 with a previous analysis) or runs the analyses afresh.
*/
225 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
226 {
227   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
228   PetscInt                      m  = A->rmap->n;
229   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
230   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
231   const MatScalar              *Aa = a->a;
232   PetscInt                     *Mi, *Mj, Mnz; // host row pointers, host column indices (temporary), nonzero count of M
233   PetscScalar                  *Ma;           // host values of M
234 
235   PetscFunctionBegin;
236   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
237     if (!fs->csrRowPtr) {                    // Is this the first time the setup is done? Use csrRowPtr since it is not null even when m=0
238       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
239       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
240       PetscCall(PetscMalloc1(m + 1, &Mi));
241       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
242       PetscCall(PetscMalloc1(Mnz, &Ma));
243       Mi[0] = 0;
      // Row i of M is laid out as [ L entries | diagonal | U entries right of the diagonal ]
244       for (PetscInt i = 0; i < m; i++) {
245         PetscInt llen = Ai[i + 1] - Ai[i];       // number of L entries in row i
246         PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of U entries in row i, diagonal included
247         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
248         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
249         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
250         Mi[i + 1] = Mi[i] + llen + ulen;
251       }
252       // Copy M (L,U) from host to device; only the structure here, the values (csrVal) are uploaded further below on every call
253       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
254       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
255       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
256       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
257       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
258 
259       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
260       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
261       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
262       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
263       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
264       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
265       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
266       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
267 
      // Both descriptors are created over the same csrRowPtr/csrColIdx/csrVal arrays; they differ only in the
      // fill-mode and diag-type attributes set on them.
268       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
269       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
270       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
271 
272       fillMode = CUSPARSE_FILL_MODE_UPPER;
273       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
274       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
275       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
276       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
277 
278       // Allocate work vectors in SpSv
279       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
280       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
281 
282       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
283       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
284 
285       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
286       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
287       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
288       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
289       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
290       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
291       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
292 
293       // Record for reuse: the host row pointers and value array outlive this call; only the column indices are released
294       fs->csrRowPtr_h = Mi;
295       fs->csrVal_h    = Ma;
296       PetscCall(PetscFree(Mj));
297     }
298     // Copy the values (done on every call, using the host arrays recorded during the first-time setup)
299     Mi  = fs->csrRowPtr_h;
300     Ma  = fs->csrVal_h;
301     Mnz = Mi[m]; // total nonzero count, recovered from the row pointers because Mnz is only computed during setup
302     for (PetscInt i = 0; i < m; i++) {
303       PetscInt llen = Ai[i + 1] - Ai[i];
304       PetscInt ulen = Adiag[i] - Adiag[i + 1];
305       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
306       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry: M gets the reciprocal of the stored value (presumably the host factor keeps the inverted diagonal - confirm)
307       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
308     }
309     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
310 
311   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
312     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
313       // The NULL guard avoids the cusparse error: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
314       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
315       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
316     } else
317   #endif
318     {
319       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      // (with CUDA older than 12.1.1 this block is unconditional, so the analysis is redone on every call)
320       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
321 
322       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      // The non-transpose analyses are now current; the transpose-analysis flag is cleared
323       fs->updatedSpSVAnalysis          = PETSC_TRUE;
324       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
325     }
326   }
327   PetscFunctionReturn(PETSC_SUCCESS);
328 }
329 #else
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds, or refreshes the values of, the unit lower triangular
  factor of A as a CSR matrix on the GPU, together with its cusparseXcsrsv solve analysis. Compiled only
  when the CUDA >= 11.4 variant above is not (see the enclosing #if/#else).

  Input Parameter:
. A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

  Returns immediately when A has no rows, and does nothing unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  When no lower factor structure exists yet, the full CSR matrix (row offsets, column indices, values) is
  assembled on the host, copied into a new CsrMatrix, analysed, and stored in loTriFactorPtr. Otherwise
  only the values are re-assembled and re-uploaded.
*/
330 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
331 {
332   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
333   PetscInt                           n                  = A->rmap->n;
334   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
335   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
336   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
337   const MatScalar                   *aa = a->a, *v;
338   PetscInt                          *AiLo, *AjLo;
339   PetscInt                           i, nz, nzLower, offset, rowOffset;
340 
341   PetscFunctionBegin;
342   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
343   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
344     try {
345       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
346       nzLower = n + ai[n] - ai[1];
347       if (!loTriFactor) {
348         PetscScalar *AALo;
349 
        /* host staging arrays, allocated with cudaMallocHost (pinned host memory) */
350         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
351 
352         /* Allocate Space for the lower triangular matrix */
353         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
354         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
355 
356         /* Fill the lower triangular matrix; row 0 consists of the unit diagonal only */
357         AiLo[0]   = (PetscInt)0;
358         AiLo[n]   = nzLower;
359         AjLo[0]   = (PetscInt)0;
360         AALo[0]   = (MatScalar)1.0;
        /* v and vi walk aa and aj sequentially from their start while the loop begins at row 1;
           NOTE(review): this lines up with row 1 only if ai[1] == 0, i.e. row 0 has no stored entries here - confirm */
361         v         = aa;
362         vi        = aj;
363         offset    = 1;
364         rowOffset = 1;
365         for (i = 1; i < n; i++) {
366           nz = ai[i + 1] - ai[i];
367           /* additional 1 for the term on the diagonal */
368           AiLo[i] = rowOffset;
369           rowOffset += nz + 1;
370 
371           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
372           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
373 
          /* append the unit diagonal entry after the copied row entries */
374           offset += nz;
375           AjLo[offset] = (PetscInt)i;
376           AALo[offset] = (MatScalar)1.0;
377           offset += 1;
378 
379           v += nz;
380           vi += nz;
381         }
382 
383         /* allocate space for the triangular factor information */
384         PetscCall(PetscNew(&loTriFactor));
385         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
386         /* Create the matrix description */
387         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
388         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
389   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
390         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
391   #else
392         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
393   #endif
394         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
395         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
396 
397         /* set the operation */
398         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
399 
400         /* set the matrix: each array is created with its final size and then filled from the host staging arrays */
401         loTriFactor->csrMat              = new CsrMatrix;
402         loTriFactor->csrMat->num_rows    = n;
403         loTriFactor->csrMat->num_cols    = n;
404         loTriFactor->csrMat->num_entries = nzLower;
405 
        /* the host index arrays hold PetscInt; THRUSTINTARRAY32 presumably stores 32-bit integers (by its name), so assign() would convert - confirm */
406         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
407         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
408 
409         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
410         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
411 
412         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
413         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
414 
415         /* Create the solve analysis information */
416         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
417         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
418   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
419         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
420                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
421         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
422   #endif
423 
424         /* perform the solve analysis */
425         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
426                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
427         PetscCallCUDA(WaitForCUDA());
428         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
429 
430         /* assign the pointer; the host value array is kept in AA_h for later value-only updates, the index arrays are released */
431         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
432         loTriFactor->AA_h                                          = AALo;
433         PetscCallCUDA(cudaFreeHost(AiLo));
434         PetscCallCUDA(cudaFreeHost(AjLo));
435         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
436       } else { /* update values only */
437         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
438         /* Fill the lower triangular matrix values, using the same layout as in the first-time branch */
439         loTriFactor->AA_h[0] = 1.0;
440         v                    = aa;
441         vi                   = aj; /* assigned but not read in this branch */
442         offset               = 1;
443         for (i = 1; i < n; i++) {
444           nz = ai[i + 1] - ai[i];
445           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
446           offset += nz;
447           loTriFactor->AA_h[offset] = 1.0;
448           offset += 1;
449           v += nz;
450         }
        /* only the values array is refreshed; row offsets, column indices and the solve analysis are left untouched here */
451         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
452         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
453       }
454     } catch (char *ex) { /* only exceptions thrown as char* are handled here */
455       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
456     }
457   }
458   PetscFunctionReturn(PETSC_SUCCESS);
459 }
460 
461 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
462 {
463   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
464   PetscInt                           n                  = A->rmap->n;
465   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
466   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
467   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
468   const MatScalar                   *aa = a->a, *v;
469   PetscInt                          *AiUp, *AjUp;
470   PetscInt                           i, nz, nzUpper, offset;
471 
472   PetscFunctionBegin;
473   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
474   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
475     try {
476       /* next, figure out the number of nonzeros in the upper triangular matrix. */
477       nzUpper = adiag[0] - adiag[n];
478       if (!upTriFactor) {
479         PetscScalar *AAUp;
480 
481         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
482 
483         /* Allocate Space for the upper triangular matrix */
484         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
485         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
486 
487         /* Fill the upper triangular matrix */
488         AiUp[0] = (PetscInt)0;
489         AiUp[n] = nzUpper;
490         offset  = nzUpper;
491         for (i = n - 1; i >= 0; i--) {
492           v  = aa + adiag[i + 1] + 1;
493           vi = aj + adiag[i + 1] + 1;
494 
495           /* number of elements NOT on the diagonal */
496           nz = adiag[i] - adiag[i + 1] - 1;
497 
498           /* decrement the offset */
499           offset -= (nz + 1);
500 
501           /* first, set the diagonal elements */
502           AjUp[offset] = (PetscInt)i;
503           AAUp[offset] = (MatScalar)1. / v[nz];
504           AiUp[i]      = AiUp[i + 1] - (nz + 1);
505 
506           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
507           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
508         }
509 
510         /* allocate space for the triangular factor information */
511         PetscCall(PetscNew(&upTriFactor));
512         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
513 
514         /* Create the matrix description */
515         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
516         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
517   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
518         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
519   #else
520         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
521   #endif
522         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
523         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
524 
525         /* set the operation */
526         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
527 
528         /* set the matrix */
529         upTriFactor->csrMat              = new CsrMatrix;
530         upTriFactor->csrMat->num_rows    = n;
531         upTriFactor->csrMat->num_cols    = n;
532         upTriFactor->csrMat->num_entries = nzUpper;
533 
534         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
535         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
536 
537         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
538         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
539 
540         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
541         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
542 
543         /* Create the solve analysis information */
544         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
545         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
546   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
547         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
548                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
549         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
550   #endif
551 
552         /* perform the solve analysis */
553         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
554                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
555 
556         PetscCallCUDA(WaitForCUDA());
557         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
558 
559         /* assign the pointer */
560         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
561         upTriFactor->AA_h                                          = AAUp;
562         PetscCallCUDA(cudaFreeHost(AiUp));
563         PetscCallCUDA(cudaFreeHost(AjUp));
564         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
565       } else {
566         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
567         /* Fill the upper triangular matrix */
568         offset = nzUpper;
569         for (i = n - 1; i >= 0; i--) {
570           v = aa + adiag[i + 1] + 1;
571 
572           /* number of elements NOT on the diagonal */
573           nz = adiag[i] - adiag[i + 1] - 1;
574 
575           /* decrement the offset */
576           offset -= (nz + 1);
577 
578           /* first, set the diagonal elements */
579           upTriFactor->AA_h[offset] = 1. / v[nz];
580           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
581         }
582         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
583         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
584       }
585     } catch (char *ex) {
586       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
587     }
588   }
589   PetscFunctionReturn(PETSC_SUCCESS);
590 }
591 #endif
592 
593 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
594 {
595   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
596   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
597   IS                            isrow = a->row, isicol = a->icol;
598   PetscBool                     row_identity, col_identity;
599   PetscInt                      n = A->rmap->n;
600 
601   PetscFunctionBegin;
602   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
603 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
604   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
605 #else
606   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
607   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
608   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
609 #endif
610 
611   cusparseTriFactors->nnz = a->nz;
612 
613   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
614   /* lower triangular indices */
615   PetscCall(ISIdentity(isrow, &row_identity));
616   if (!row_identity && !cusparseTriFactors->rpermIndices) {
617     const PetscInt *r;
618 
619     PetscCall(ISGetIndices(isrow, &r));
620     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
621     cusparseTriFactors->rpermIndices->assign(r, r + n);
622     PetscCall(ISRestoreIndices(isrow, &r));
623     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
624   }
625 
626   /* upper triangular indices */
627   PetscCall(ISIdentity(isicol, &col_identity));
628   if (!col_identity && !cusparseTriFactors->cpermIndices) {
629     const PetscInt *c;
630 
631     PetscCall(ISGetIndices(isicol, &c));
632     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
633     cusparseTriFactors->cpermIndices->assign(c, c + n);
634     PetscCall(ISRestoreIndices(isicol, &c));
635     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
636   }
637   PetscFunctionReturn(PETSC_SUCCESS);
638 }
639 
640 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
641 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
642 {
643   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
644   PetscInt                      m  = A->rmap->n;
645   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
646   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
647   const MatScalar              *Aa = a->a;
648   PetscInt                     *Mj, Mnz;
649   PetscScalar                  *Ma, *D;
650 
651   PetscFunctionBegin;
652   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
653     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
654       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
655       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
656       Mnz = Ai[m]; // Unz (with the unit diagonal)
657       PetscCall(PetscMalloc1(Mnz, &Ma));
658       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
659       PetscCall(PetscMalloc1(m, &D));    // the diagonal
660       for (PetscInt i = 0; i < m; i++) {
661         PetscInt ulen = Ai[i + 1] - Ai[i];
662         Mj[Ai[i]]     = i;                                              // diagonal entry
663         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
664       }
665       // Copy M (U) from host to device
666       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
667       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
668       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
669       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
670       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
671       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
672 
673       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
674       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
675       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
676       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
677       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
678       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
679       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
680       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
681 
682       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
683       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
684       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
685 
686       // Allocate work vectors in SpSv
687       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
688       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
689 
690       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
691       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
692 
693       // Query buffer sizes for SpSV and then allocate buffers
694       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
695       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
696       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
697 
698       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
699       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
700       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
701 
702       // Record for reuse
703       fs->csrVal_h = Ma;
704       fs->diag_h   = D;
705       PetscCall(PetscFree(Mj));
706     }
707     // Copy the value
708     Ma  = fs->csrVal_h;
709     D   = fs->diag_h;
710     Mnz = Ai[m];
711     for (PetscInt i = 0; i < m; i++) {
712       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
713       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
714       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
715     }
716     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
717     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
718 
719   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
720     if (fs->updatedSpSVAnalysis) {
721       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
722       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
723     } else
724   #endif
725     {
726       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
727       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
728       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
729       fs->updatedSpSVAnalysis = PETSC_TRUE;
730     }
731   }
732   PetscFunctionReturn(PETSC_SUCCESS);
733 }
734 
735 // Solve Ut D U x = b
736 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
737 {
738   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
739   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
740   const PetscScalar                    *barray;
741   PetscScalar                          *xarray;
742   thrust::device_ptr<const PetscScalar> bGPU;
743   thrust::device_ptr<PetscScalar>       xGPU;
744   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
745   PetscInt                              m   = A->rmap->n;
746 
747   PetscFunctionBegin;
748   PetscCall(PetscLogGpuTimeBegin());
749   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
750   PetscCall(VecCUDAGetArrayRead(b, &barray));
751   xGPU = thrust::device_pointer_cast(xarray);
752   bGPU = thrust::device_pointer_cast(barray);
753 
754   // Reorder b with the row permutation if needed, and wrap the result in fs->X
755   if (fs->rpermIndices) {
756     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
757     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
758   } else {
759     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
760   }
761 
762   // Solve Ut Y = X
763   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
764   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
765 
766   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
767   // It is basically a vector element-wise multiplication, but cublas does not have it!
768   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
769 
770   // Solve U X = Y
771   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
772     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
773   } else {
774     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
775   }
776   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
777 
778   // Reorder X with the column permutation if needed, and put the result back to x
779   if (fs->cpermIndices) {
780     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
781                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
782   }
783 
784   PetscCall(VecCUDARestoreArrayRead(b, &barray));
785   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
786   PetscCall(PetscLogGpuTimeEnd());
787   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
788   PetscFunctionReturn(PETSC_SUCCESS);
789 }
790 #else
791 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
792 {
793   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
794   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
795   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
796   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
797   PetscInt                          *AiUp, *AjUp;
798   PetscScalar                       *AAUp;
799   PetscScalar                       *AALo;
800   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
801   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
802   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
803   const MatScalar                   *aa = b->a, *v;
804 
805   PetscFunctionBegin;
806   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
807   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
808     try {
809       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
810       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
811       if (!upTriFactor && !loTriFactor) {
812         /* Allocate Space for the upper triangular matrix */
813         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
814         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
815 
816         /* Fill the upper triangular matrix */
817         AiUp[0] = (PetscInt)0;
818         AiUp[n] = nzUpper;
819         offset  = 0;
820         for (i = 0; i < n; i++) {
821           /* set the pointers */
822           v  = aa + ai[i];
823           vj = aj + ai[i];
824           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
825 
826           /* first, set the diagonal elements */
827           AjUp[offset] = (PetscInt)i;
828           AAUp[offset] = (MatScalar)1.0 / v[nz];
829           AiUp[i]      = offset;
830           AALo[offset] = (MatScalar)1.0 / v[nz];
831 
832           offset += 1;
833           if (nz > 0) {
834             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
835             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
836             for (j = offset; j < offset + nz; j++) {
837               AAUp[j] = -AAUp[j];
838               AALo[j] = AAUp[j] / v[nz];
839             }
840             offset += nz;
841           }
842         }
843 
844         /* allocate space for the triangular factor information */
845         PetscCall(PetscNew(&upTriFactor));
846         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
847 
848         /* Create the matrix description */
849         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
850         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
851   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
852         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
853   #else
854         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
855   #endif
856         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
857         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
858 
859         /* set the matrix */
860         upTriFactor->csrMat              = new CsrMatrix;
861         upTriFactor->csrMat->num_rows    = A->rmap->n;
862         upTriFactor->csrMat->num_cols    = A->cmap->n;
863         upTriFactor->csrMat->num_entries = a->nz;
864 
865         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
866         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
867 
868         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
869         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
870 
871         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
872         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
873 
874         /* set the operation */
875         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
876 
877         /* Create the solve analysis information */
878         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
879         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
880   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
881         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
882                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
883         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
884   #endif
885 
886         /* perform the solve analysis */
887         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
888                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
889 
890         PetscCallCUDA(WaitForCUDA());
891         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
892 
893         /* assign the pointer */
894         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
895 
896         /* allocate space for the triangular factor information */
897         PetscCall(PetscNew(&loTriFactor));
898         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
899 
900         /* Create the matrix description */
901         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
902         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
903   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
904         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
905   #else
906         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
907   #endif
908         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
909         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
910 
911         /* set the operation */
912         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
913 
914         /* set the matrix */
915         loTriFactor->csrMat              = new CsrMatrix;
916         loTriFactor->csrMat->num_rows    = A->rmap->n;
917         loTriFactor->csrMat->num_cols    = A->cmap->n;
918         loTriFactor->csrMat->num_entries = a->nz;
919 
920         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
921         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
922 
923         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
924         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
925 
926         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
927         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
928 
929         /* Create the solve analysis information */
930         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
931         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
932   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
933         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
934                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
935         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
936   #endif
937 
938         /* perform the solve analysis */
939         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
940                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
941 
942         PetscCallCUDA(WaitForCUDA());
943         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
944 
945         /* assign the pointer */
946         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
947 
948         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
949         PetscCallCUDA(cudaFreeHost(AiUp));
950         PetscCallCUDA(cudaFreeHost(AjUp));
951       } else {
952         /* Fill the upper triangular matrix */
953         offset = 0;
954         for (i = 0; i < n; i++) {
955           /* set the pointers */
956           v  = aa + ai[i];
957           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
958 
959           /* first, set the diagonal elements */
960           AAUp[offset] = 1.0 / v[nz];
961           AALo[offset] = 1.0 / v[nz];
962 
963           offset += 1;
964           if (nz > 0) {
965             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
966             for (j = offset; j < offset + nz; j++) {
967               AAUp[j] = -AAUp[j];
968               AALo[j] = AAUp[j] / v[nz];
969             }
970             offset += nz;
971           }
972         }
973         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
974         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
975         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
976         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
977         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
978       }
979       PetscCallCUDA(cudaFreeHost(AAUp));
980       PetscCallCUDA(cudaFreeHost(AALo));
981     } catch (char *ex) {
982       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
983     }
984   }
985   PetscFunctionReturn(PETSC_SUCCESS);
986 }
987 #endif
988 
989 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
990 {
991   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
992   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
993   IS                            ip                 = a->row;
994   PetscBool                     perm_identity;
995   PetscInt                      n = A->rmap->n;
996 
997   PetscFunctionBegin;
998   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
999 
1000 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1001   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
1002 #else
1003   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
1004   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
1005 #endif
1006   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
1007 
1008   A->offloadmask = PETSC_OFFLOAD_BOTH;
1009 
1010   /* lower triangular indices */
1011   PetscCall(ISIdentity(ip, &perm_identity));
1012   if (!perm_identity) {
1013     IS              iip;
1014     const PetscInt *irip, *rip;
1015 
1016     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
1017     PetscCall(ISGetIndices(iip, &irip));
1018     PetscCall(ISGetIndices(ip, &rip));
1019     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1020     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1021     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1022     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1023     PetscCall(ISRestoreIndices(iip, &irip));
1024     PetscCall(ISDestroy(&iip));
1025     PetscCall(ISRestoreIndices(ip, &rip));
1026     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1027   }
1028   PetscFunctionReturn(PETSC_SUCCESS);
1029 }
1030 
1031 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1032 {
1033   PetscFunctionBegin;
1034   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1035   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1036   B->offloadmask = PETSC_OFFLOAD_CPU;
1037 
1038 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1039   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1040   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1041 #else
1042   /* determine which version of MatSolve needs to be used. */
1043   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1044   IS          ip = b->row;
1045   PetscBool   perm_identity;
1046 
1047   PetscCall(ISIdentity(ip, &perm_identity));
1048   if (perm_identity) {
1049     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1050     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1051   } else {
1052     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1053     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1054   }
1055 #endif
1056   B->ops->matsolve          = NULL;
1057   B->ops->matsolvetranspose = NULL;
1058 
1059   /* get the triangular factors */
1060   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1061   PetscFunctionReturn(PETSC_SUCCESS);
1062 }
1063 
1064 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1065 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1066 {
1067   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1068   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1069   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1070   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1071   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1072   cusparseIndexBase_t                indexBase;
1073   cusparseMatrixType_t               matrixType;
1074   cusparseFillMode_t                 fillMode;
1075   cusparseDiagType_t                 diagType;
1076 
1077   PetscFunctionBegin;
1078   /* allocate space for the transpose of the lower triangular factor */
1079   PetscCall(PetscNew(&loTriFactorT));
1080   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1081 
1082   /* set the matrix descriptors of the lower triangular factor */
1083   matrixType = cusparseGetMatType(loTriFactor->descr);
1084   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1085   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1086   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1087 
1088   /* Create the matrix description */
1089   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1090   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1091   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1092   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1093   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1094 
1095   /* set the operation */
1096   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1097 
1098   /* allocate GPU space for the CSC of the lower triangular factor*/
1099   loTriFactorT->csrMat                 = new CsrMatrix;
1100   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1101   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1102   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1103   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1104   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1105   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1106 
1107   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1108   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1109   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1110                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1111                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1112   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1113   #endif
1114 
1115   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1116   {
1117     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1118     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1119                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1120   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1121                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1122   #else
1123                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1124   #endif
1125     PetscCallCUSPARSE(stat);
1126   }
1127 
1128   PetscCallCUDA(WaitForCUDA());
1129   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1130 
1131   /* Create the solve analysis information */
1132   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1133   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1134   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1135   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1136                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1137   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1138   #endif
1139 
1140   /* perform the solve analysis */
1141   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1142                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1143 
1144   PetscCallCUDA(WaitForCUDA());
1145   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1146 
1147   /* assign the pointer */
1148   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1149 
1150   /*********************************************/
1151   /* Now the Transpose of the Upper Tri Factor */
1152   /*********************************************/
1153 
1154   /* allocate space for the transpose of the upper triangular factor */
1155   PetscCall(PetscNew(&upTriFactorT));
1156   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1157 
1158   /* set the matrix descriptors of the upper triangular factor */
1159   matrixType = cusparseGetMatType(upTriFactor->descr);
1160   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1161   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1162   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1163 
1164   /* Create the matrix description */
1165   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1166   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1167   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1168   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1169   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1170 
1171   /* set the operation */
1172   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1173 
1174   /* allocate GPU space for the CSC of the upper triangular factor*/
1175   upTriFactorT->csrMat                 = new CsrMatrix;
1176   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1177   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1178   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1179   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1180   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1181   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1182 
1183   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1184   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1185   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1186                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1187                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1188   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1189   #endif
1190 
1191   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1192   {
1193     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1194     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1195                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1196   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1197                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1198   #else
1199                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1200   #endif
1201     PetscCallCUSPARSE(stat);
1202   }
1203 
1204   PetscCallCUDA(WaitForCUDA());
1205   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1206 
1207   /* Create the solve analysis information */
1208   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1209   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1210   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1211   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1212                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1213   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1214   #endif
1215 
1216   /* perform the solve analysis */
1217   /* christ, would it have killed you to put this stuff in a function????????? */
1218   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1219                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1220 
1221   PetscCallCUDA(WaitForCUDA());
1222   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1223 
1224   /* assign the pointer */
1225   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1226   PetscFunctionReturn(PETSC_SUCCESS);
1227 }
1228 #endif
1229 
1230 struct PetscScalarToPetscInt {
1231   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1232 };
1233 
1234 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1235 {
1236   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1237   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1238   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1239   cusparseStatus_t              stat;
1240   cusparseIndexBase_t           indexBase;
1241 
1242   PetscFunctionBegin;
1243   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1244   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1245   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1246   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1247   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1248   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1249   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1250   PetscCall(PetscLogGpuTimeBegin());
1251   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1252   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1253     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1254     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1255     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1256     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1257     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1258 
1259     /* set alpha and beta */
1260     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1261     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1262     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1263     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1264     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1265     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1266 
1267     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1268       CsrMatrix *matrixT      = new CsrMatrix;
1269       matstructT->mat         = matrixT;
1270       matrixT->num_rows       = A->cmap->n;
1271       matrixT->num_cols       = A->rmap->n;
1272       matrixT->num_entries    = a->nz;
1273       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1274       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1275       matrixT->values         = new THRUSTARRAY(a->nz);
1276 
1277       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1278       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1279 
1280 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1281   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1282       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1283                                indexBase, cusparse_scalartype);
1284       PetscCallCUSPARSE(stat);
1285   #else
1286       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1287            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1288 
1289            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1290            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1291            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1292         */
1293       if (matrixT->num_entries) {
1294         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1295         PetscCallCUSPARSE(stat);
1296 
1297       } else {
1298         matstructT->matDescr = NULL;
1299         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1300       }
1301   #endif
1302 #endif
1303     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1304 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1305       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1306 #else
1307       CsrMatrix *temp  = new CsrMatrix;
1308       CsrMatrix *tempT = new CsrMatrix;
1309       /* First convert HYB to CSR */
1310       temp->num_rows       = A->rmap->n;
1311       temp->num_cols       = A->cmap->n;
1312       temp->num_entries    = a->nz;
1313       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1314       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1315       temp->values         = new THRUSTARRAY(a->nz);
1316 
1317       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1318       PetscCallCUSPARSE(stat);
1319 
1320       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1321       tempT->num_rows       = A->rmap->n;
1322       tempT->num_cols       = A->cmap->n;
1323       tempT->num_entries    = a->nz;
1324       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1325       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1326       tempT->values         = new THRUSTARRAY(a->nz);
1327 
1328       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1329                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1330       PetscCallCUSPARSE(stat);
1331 
1332       /* Last, convert CSC to HYB */
1333       cusparseHybMat_t hybMat;
1334       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1335       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1336       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1337       PetscCallCUSPARSE(stat);
1338 
1339       /* assign the pointer */
1340       matstructT->mat = hybMat;
1341       A->transupdated = PETSC_TRUE;
1342       /* delete temporaries */
1343       if (tempT) {
1344         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1345         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1346         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1347         delete (CsrMatrix *)tempT;
1348       }
1349       if (temp) {
1350         if (temp->values) delete (THRUSTARRAY *)temp->values;
1351         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1352         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1353         delete (CsrMatrix *)temp;
1354       }
1355 #endif
1356     }
1357   }
1358   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1359     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1360     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1361     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1362     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1363     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1364     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1365     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1366     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1367     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1368     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1369     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1370       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1371       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1372       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1373     }
1374     if (!cusparsestruct->csr2csc_i) {
1375       THRUSTARRAY csr2csc_a(matrix->num_entries);
1376       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1377 
1378       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1379 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1380       void  *csr2cscBuffer;
1381       size_t csr2cscBufferSize;
1382       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1383                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1384       PetscCallCUSPARSE(stat);
1385       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1386 #endif
1387 
1388       if (matrix->num_entries) {
1389         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1390            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1391            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1392 
1393            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1394            should be filled with indexBase. So I just take a shortcut here.
1395         */
1396         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1397 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1398                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1399         PetscCallCUSPARSE(stat);
1400 #else
1401                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1402         PetscCallCUSPARSE(stat);
1403 #endif
1404       } else {
1405         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1406       }
1407 
1408       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1409       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1410 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1411       PetscCallCUDA(cudaFree(csr2cscBuffer));
1412 #endif
1413     }
1414     PetscCallThrust(
1415       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1416   }
1417   PetscCall(PetscLogGpuTimeEnd());
1418   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1419   /* the compressed row indices is not used for matTranspose */
1420   matstructT->cprowIndices = NULL;
1421   /* assign the pointer */
1422   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1423   A->transupdated                                = PETSC_TRUE;
1424   PetscFunctionReturn(PETSC_SUCCESS);
1425 }
1426 
1427 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1428 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1429 {
1430   const PetscScalar                    *barray;
1431   PetscScalar                          *xarray;
1432   thrust::device_ptr<const PetscScalar> bGPU;
1433   thrust::device_ptr<PetscScalar>       xGPU;
1434   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1435   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1436   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1437   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1438   PetscInt                              m   = A->rmap->n;
1439 
1440   PetscFunctionBegin;
1441   PetscCall(PetscLogGpuTimeBegin());
1442   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1443   PetscCall(VecCUDAGetArrayRead(b, &barray));
1444   xGPU = thrust::device_pointer_cast(xarray);
1445   bGPU = thrust::device_pointer_cast(barray);
1446 
1447   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1448   if (fs->rpermIndices) {
1449     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1450     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1451   } else {
1452     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1453   }
1454 
1455   // Solve L Y = X
1456   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1457   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1458   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1459 
1460   // Solve U X = Y
1461   if (fs->cpermIndices) {
1462     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1463   } else {
1464     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1465   }
1466   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1467 
1468   // Reorder X with the column permutation if needed, and put the result back to x
1469   if (fs->cpermIndices) {
1470     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1471                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1472   }
1473   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1474   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1475   PetscCall(PetscLogGpuTimeEnd());
1476   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1477   PetscFunctionReturn(PETSC_SUCCESS);
1478 }
1479 
1480 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1481 {
1482   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1483   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1484   const PetscScalar                    *barray;
1485   PetscScalar                          *xarray;
1486   thrust::device_ptr<const PetscScalar> bGPU;
1487   thrust::device_ptr<PetscScalar>       xGPU;
1488   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1489   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1490   PetscInt                              m   = A->rmap->n;
1491 
1492   PetscFunctionBegin;
1493   PetscCall(PetscLogGpuTimeBegin());
1494   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1495     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1496     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1497                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1498 
1499     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1500     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1501     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1502     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1503     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1504   }
1505 
1506   if (!fs->updatedTransposeSpSVAnalysis) {
1507     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1508 
1509     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1510     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1511   }
1512 
1513   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1514   PetscCall(VecCUDAGetArrayRead(b, &barray));
1515   xGPU = thrust::device_pointer_cast(xarray);
1516   bGPU = thrust::device_pointer_cast(barray);
1517 
1518   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1519   if (fs->rpermIndices) {
1520     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1521     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1522   } else {
1523     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1524   }
1525 
1526   // Solve Ut Y = X
1527   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1528   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1529 
1530   // Solve Lt X = Y
1531   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1532     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1533   } else {
1534     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1535   }
1536   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1537 
1538   // Reorder X with the column permutation if needed, and put the result back to x
1539   if (fs->cpermIndices) {
1540     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1541                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1542   }
1543 
1544   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1545   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1546   PetscCall(PetscLogGpuTimeEnd());
1547   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1548   PetscFunctionReturn(PETSC_SUCCESS);
1549 }
1550 #else
1551 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1552 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1553 {
1554   PetscInt                              n = xx->map->n;
1555   const PetscScalar                    *barray;
1556   PetscScalar                          *xarray;
1557   thrust::device_ptr<const PetscScalar> bGPU;
1558   thrust::device_ptr<PetscScalar>       xGPU;
1559   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1560   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1561   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1562   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1563 
1564   PetscFunctionBegin;
1565   /* Analyze the matrix and create the transpose ... on the fly */
1566   if (!loTriFactorT && !upTriFactorT) {
1567     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1568     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1569     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1570   }
1571 
1572   /* Get the GPU pointers */
1573   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1574   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1575   xGPU = thrust::device_pointer_cast(xarray);
1576   bGPU = thrust::device_pointer_cast(barray);
1577 
1578   PetscCall(PetscLogGpuTimeBegin());
1579   /* First, reorder with the row permutation */
1580   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1581 
1582   /* First, solve U */
1583   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1584                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1585 
1586   /* Then, solve L */
1587   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1588                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1589 
1590   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1591   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1592 
1593   /* Copy the temporary to the full solution. */
1594   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1595 
1596   /* restore */
1597   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1598   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1599   PetscCall(PetscLogGpuTimeEnd());
1600   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1601   PetscFunctionReturn(PETSC_SUCCESS);
1602 }
1603 
1604 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1605 {
1606   const PetscScalar                 *barray;
1607   PetscScalar                       *xarray;
1608   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1609   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1610   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1611   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1612 
1613   PetscFunctionBegin;
1614   /* Analyze the matrix and create the transpose ... on the fly */
1615   if (!loTriFactorT && !upTriFactorT) {
1616     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1617     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1618     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1619   }
1620 
1621   /* Get the GPU pointers */
1622   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1623   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1624 
1625   PetscCall(PetscLogGpuTimeBegin());
1626   /* First, solve U */
1627   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1628                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1629 
1630   /* Then, solve L */
1631   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1632                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1633 
1634   /* restore */
1635   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1636   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1637   PetscCall(PetscLogGpuTimeEnd());
1638   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1639   PetscFunctionReturn(PETSC_SUCCESS);
1640 }
1641 
1642 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1643 {
1644   const PetscScalar                    *barray;
1645   PetscScalar                          *xarray;
1646   thrust::device_ptr<const PetscScalar> bGPU;
1647   thrust::device_ptr<PetscScalar>       xGPU;
1648   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1649   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1650   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1651   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1652 
1653   PetscFunctionBegin;
1654   /* Get the GPU pointers */
1655   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1656   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1657   xGPU = thrust::device_pointer_cast(xarray);
1658   bGPU = thrust::device_pointer_cast(barray);
1659 
1660   PetscCall(PetscLogGpuTimeBegin());
1661   /* First, reorder with the row permutation */
1662   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1663 
1664   /* Next, solve L */
1665   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1666                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1667 
1668   /* Then, solve U */
1669   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1670                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1671 
1672   /* Last, reorder with the column permutation */
1673   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1674 
1675   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1676   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1677   PetscCall(PetscLogGpuTimeEnd());
1678   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1679   PetscFunctionReturn(PETSC_SUCCESS);
1680 }
1681 
1682 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1683 {
1684   const PetscScalar                 *barray;
1685   PetscScalar                       *xarray;
1686   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1687   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1688   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1689   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1690 
1691   PetscFunctionBegin;
1692   /* Get the GPU pointers */
1693   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1694   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1695 
1696   PetscCall(PetscLogGpuTimeBegin());
1697   /* First, solve L */
1698   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1699                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1700 
1701   /* Next, solve U */
1702   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1703                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1704 
1705   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1706   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1707   PetscCall(PetscLogGpuTimeEnd());
1708   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1709   PetscFunctionReturn(PETSC_SUCCESS);
1710 }
1711 #endif
1712 
1713 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1714 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1715 {
1716   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1717   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1718   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1719   CsrMatrix                    *Acsr;
1720   PetscInt                      m, nz;
1721   PetscBool                     flg;
1722 
1723   PetscFunctionBegin;
1724   if (PetscDefined(USE_DEBUG)) {
1725     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1726     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1727   }
1728 
1729   /* Copy A's value to fact */
1730   m  = fact->rmap->n;
1731   nz = aij->nz;
1732   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1733   Acsr = (CsrMatrix *)Acusp->mat->mat;
1734   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1735 
1736   PetscCall(PetscLogGpuTimeBegin());
1737   /* Factorize fact inplace */
1738   if (m)
1739     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1740                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1741   if (PetscDefined(USE_DEBUG)) {
1742     int              numerical_zero;
1743     cusparseStatus_t status;
1744     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1745     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1746   }
1747 
1748   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1749   if (fs->updatedSpSVAnalysis) {
1750     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1751     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1752   } else
1753   #endif
1754   {
1755     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1756      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1757     */
1758     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1759 
1760     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1761 
1762     fs->updatedSpSVAnalysis = PETSC_TRUE;
1763     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1764     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1765   }
1766 
1767   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1768   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1769   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1770   fact->ops->matsolve          = NULL;
1771   fact->ops->matsolvetranspose = NULL;
1772   PetscCall(PetscLogGpuTimeEnd());
1773   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1774   PetscFunctionReturn(PETSC_SUCCESS);
1775 }
1776 
1777 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1778 {
1779   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1780   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1781   PetscInt                      m, nz;
1782 
1783   PetscFunctionBegin;
1784   if (PetscDefined(USE_DEBUG)) {
1785     PetscInt  i;
1786     PetscBool flg, missing;
1787 
1788     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1789     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1790     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1791     PetscCall(MatMissingDiagonal(A, &missing, &i));
1792     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1793   }
1794 
1795   /* Free the old stale stuff */
1796   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1797 
1798   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1799      but they will not be used. Allocate them just for easy debugging.
1800    */
1801   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1802 
1803   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1804   fact->factortype             = MAT_FACTOR_ILU;
1805   fact->info.factor_mallocs    = 0;
1806   fact->info.fill_ratio_given  = info->fill;
1807   fact->info.fill_ratio_needed = 1.0;
1808 
1809   aij->row = NULL;
1810   aij->col = NULL;
1811 
1812   /* ====================================================================== */
1813   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1814   /* We'll do in-place factorization on fact                                */
1815   /* ====================================================================== */
1816   const int *Ai, *Aj;
1817 
1818   m  = fact->rmap->n;
1819   nz = aij->nz;
1820 
1821   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1822   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1823   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1824   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1825   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1826   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1827 
1828   /* ====================================================================== */
1829   /* Create descriptors for M, L, U                                         */
1830   /* ====================================================================== */
1831   cusparseFillMode_t fillMode;
1832   cusparseDiagType_t diagType;
1833 
1834   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1835   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1836   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1837 
1838   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1839     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1840     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1841     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1842     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1843   */
1844   fillMode = CUSPARSE_FILL_MODE_LOWER;
1845   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1846   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1847   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1848   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1849 
1850   fillMode = CUSPARSE_FILL_MODE_UPPER;
1851   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1852   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1853   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1854   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1855 
1856   /* ========================================================================= */
1857   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1858   /* ========================================================================= */
1859   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1860   if (m)
1861     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1862                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1863 
1864   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1865   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1866 
1867   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1868   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1869 
1870   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1871   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1872 
1873   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1874   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1875 
1876   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1877      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1878      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1879      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1880    */
1881   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1882     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1883     fs->spsvBuffer_L = fs->factBuffer_M;
1884     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1885   } else {
1886     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1887     fs->spsvBuffer_U = fs->factBuffer_M;
1888     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1889   }
1890 
1891   /* ========================================================================== */
1892   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1893   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1894   /* ========================================================================== */
1895   int              structural_zero;
1896   cusparseStatus_t status;
1897 
1898   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1899   if (m)
1900     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1901                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1902   if (PetscDefined(USE_DEBUG)) {
1903     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1904     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1905     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1906   }
1907 
1908   /* Estimate FLOPs of the numeric factorization */
1909   {
1910     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1911     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1912     PetscLogDouble flops = 0.0;
1913 
1914     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1915     Ai    = Aseq->i;
1916     Adiag = Aseq->diag;
1917     for (PetscInt i = 0; i < m; i++) {
1918       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1919         nzRow  = Ai[i + 1] - Ai[i];
1920         nzLeft = Adiag[i] - Ai[i];
1921         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1922           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1923         */
1924         nzLeft = (nzRow - 1) / 2;
1925         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1926       }
1927     }
1928     fs->numericFactFlops = flops;
1929   }
1930   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1931   PetscFunctionReturn(PETSC_SUCCESS);
1932 }
1933 
1934 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1935 {
1936   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1937   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1938   const PetscScalar            *barray;
1939   PetscScalar                  *xarray;
1940 
1941   PetscFunctionBegin;
1942   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1943   PetscCall(VecCUDAGetArrayRead(b, &barray));
1944   PetscCall(PetscLogGpuTimeBegin());
1945 
1946   /* Solve L*y = b */
1947   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1948   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1949   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1950                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1951 
1952   /* Solve Lt*x = y */
1953   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1954   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1955                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1956 
1957   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1958   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1959 
1960   PetscCall(PetscLogGpuTimeEnd());
1961   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1962   PetscFunctionReturn(PETSC_SUCCESS);
1963 }
1964 
1965 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1966 {
1967   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1968   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1969   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1970   CsrMatrix                    *Acsr;
1971   PetscInt                      m, nz;
1972   PetscBool                     flg;
1973 
1974   PetscFunctionBegin;
1975   if (PetscDefined(USE_DEBUG)) {
1976     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1977     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1978   }
1979 
1980   /* Copy A's value to fact */
1981   m  = fact->rmap->n;
1982   nz = aij->nz;
1983   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1984   Acsr = (CsrMatrix *)Acusp->mat->mat;
1985   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1986 
1987   /* Factorize fact inplace */
1988   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1989      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1990      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1991      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1992      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1993    */
1994   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1995   if (PetscDefined(USE_DEBUG)) {
1996     int              numerical_zero;
1997     cusparseStatus_t status;
1998     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1999     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
2000   }
2001 
2002   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
2003   if (fs->updatedSpSVAnalysis) {
2004     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2005     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2006   } else
2007   #endif
2008   {
2009     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
2010 
2011     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
2012     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
2013   */
2014     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
2015     fs->updatedSpSVAnalysis = PETSC_TRUE;
2016   }
2017 
2018   fact->offloadmask            = PETSC_OFFLOAD_GPU;
2019   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
2020   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
2021   fact->ops->matsolve          = NULL;
2022   fact->ops->matsolvetranspose = NULL;
2023   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2024   PetscFunctionReturn(PETSC_SUCCESS);
2025 }
2026 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - symbolic stage of the ICC(0) factorization done with cuSPARSE

  fact - factor matrix to set up; the Mat_SeqAIJCUSPARSETriFactors in fact->spptr is reset and rebuilt
  A    - matrix to factor; debug builds verify it is a square MATSEQAIJCUSPARSE without missing diagonal entries
  IS   - permutation argument, not used by this routine
  info - factorization options; only info->fill is read (stored as fill_ratio_given)

  The row offsets and column indices of A are copied device-to-device into 32-bit CSR arrays attached to
  fact, descriptors and work buffers for csric02 and for the SpSV solves with L and L^T are created, the
  csric02 analysis is run, and a FLOP estimate for the numeric stage is stored in fs->numericFactFlops.
  On success fact->ops->choleskyfactornumeric is set to MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(fact->spptr);
  Mat_SeqAIJ                   *factaij = static_cast<Mat_SeqAIJ *>(fact->data);
  PetscInt                      nrows, nnz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool is_cusparse, diag_missing;
    PetscInt  bad_row;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &is_cusparse));
    PetscCheck(is_cusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &diag_missing, &bad_row));
    PetscCheck(!diag_missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, bad_row);
  }

  /* Drop whatever a previous factorization left behind */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Bring A's meta data over to fact. The host i,j,a arrays of fact are allocated as well;
     they are not used, they are only kept around to ease debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  factaij->row = NULL;
  factaij->col = NULL;

  /* ---------------------------------------------------------------------- */
  /* Device CSR arrays of fact: i and j are copied from A, the value array  */
  /* is only allocated. The factorization is done in place on fact.         */
  /* ---------------------------------------------------------------------- */
  const int *dev_rowptr, *dev_colidx;

  nrows = fact->rmap->n;
  nnz   = factaij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (nrows + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nnz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nnz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &dev_rowptr, &dev_colidx)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, dev_rowptr, sizeof(*dev_rowptr) * (nrows + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, dev_colidx, sizeof(*dev_colidx) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ---------------------------------------------------------------------- */
  /* Matrix descriptors for M and L                                         */
  /* ---------------------------------------------------------------------- */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
  cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;

  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, nrows, nrows, nnz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ---------------------------------------------------------------------- */
  /* Buffer sizes for csric0 and for the SpSV solves with L and Lt          */
  /* ---------------------------------------------------------------------- */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (nrows) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, nrows, nnz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * nrows));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * nrows));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, nrows, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, nrows, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory the factorization buffer is shared with the larger of the two solve buffers.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L <= fs->spsvBufferSize_Lt) {
    /* Lt needs at least as much as L: Lt shares with the factorization, L gets its own buffer */
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  } else {
    /* L needs more than Lt: L shares with the factorization, Lt gets its own buffer */
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  }

  /* ---------------------------------------------------------------------- */
  /* Analysis of ic0 on M                                                   */
  /* The lower triangular part of M has the same sparsity pattern as L      */
  /* ---------------------------------------------------------------------- */
  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (nrows) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, nrows, nnz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int structural_zero;

    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    const cusparseStatus_t pivot_status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);

    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != pivot_status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* FLOP estimate of the numeric factorization, computed from A's host row offsets */
  {
    const PetscInt *host_rowptr = static_cast<Mat_SeqAIJ *>(A->data)->i;
    PetscLogDouble  estimate    = 0.0;

    for (PetscInt row = 0; row < nrows; row++) {
      const PetscInt row_nz = host_rowptr[row + 1] - host_rowptr[row];

      if (row_nz <= 1) continue;
      /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
        and include the eliminated one will be updated, which incurs a multiplication and an addition.
      */
      const PetscInt left_nz = (row_nz - 1) / 2;

      estimate += left_nz * (2.0 * row_nz - left_nz + 1);
    }
    fs->numericFactFlops = estimate;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2168 #endif
2169 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization: the factors are computed on the host by
  MatLUFactorNumeric_SeqAIJ() and, unless A asks for CPU solves, analysed and copied to the GPU.

  B    - the factor matrix (its spptr is a Mat_SeqAIJCUSPARSETriFactors)
  A    - the matrix being factored; its values are brought back to the host first
  info - factorization options, forwarded unchanged to the host routine
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve lives in A's Mat_SeqAIJCUSPARSE; B is a factored matrix and carries Mat_SeqAIJCUSPARSETriFactors instead
  Mat_SeqAIJCUSPARSE *acusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscBool           solve_on_gpu;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  solve_on_gpu = acusp->use_cpu_solve ? PETSC_FALSE : PETSC_TRUE;
  if (solve_on_gpu) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* pick the MatSolve variant that matches the row/column orderings of the factor */
    Mat_SeqAIJ *baij = (Mat_SeqAIJ *)B->data;
    PetscBool   rows_natural, cols_natural;

    PetscCall(ISIdentity(baij->row, &rows_natural));
    PetscCall(ISIdentity(baij->col, &cols_natural));
    if (!(rows_natural && cols_natural)) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the device */
  if (solve_on_gpu) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2208 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU: resets B's device triangular-factor data, runs the
  host symbolic factorization and installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* anything left from an earlier factorization is stale now */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  /* the symbolic work itself is done by the host SeqAIJ implementation */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2219 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU(k)

  With CUDA >= 11.4 the device ILU(0) path is taken when info->levels is zero, info->factoronhost is
  not set and both isrow and iscol are identity orderings. In every other case B's device factor data
  is reset and the host SeqAIJ symbolic routine is used, followed later by the CUSPARSE numeric routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    PetscBool rows_natural = PETSC_FALSE, cols_natural = PETSC_FALSE;

    /* orderings are only inspected when the factorization may run on the device */
    if (!info->factoronhost) {
      PetscCall(ISIdentity(isrow, &rows_natural));
      PetscCall(ISIdentity(iscol, &cols_natural));
    }
    if (!info->levels && rows_natural && cols_natural) {
      PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  /* host symbolic factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2242 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC(k)

  With CUDA >= 11.4 the device ICC(0) path is taken when info->levels is zero, info->factoronhost is
  not set and perm is the identity. Otherwise B's device factor data is reset and the host SeqAIJ
  symbolic routine is used, with the CUSPARSE Cholesky numeric routine installed afterwards.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    PetscBool natural = PETSC_FALSE;

    /* the ordering is only inspected when the factorization may run on the device */
    if (!info->factoronhost) PetscCall(ISIdentity(perm, &natural));
    if (!info->levels && natural) {
      PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  /* host symbolic factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2262 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky: resets B's device triangular-factor data,
  runs the host symbolic factorization and installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* anything left from an earlier factorization is stale now */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  /* the symbolic work itself is done by the host SeqAIJ implementation */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2273 
/* Reports MATSOLVERCUSPARSE as the solver type; the matrix argument is not inspected. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  (*type) = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2280 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
2294 
/*
  MatGetFactor_seqaijcusparse_cusparse - creates the factor matrix used by the "cusparse" solver type

  A     - the matrix to be factored; only its local row count, CPU-binding flags and block sizes are read
  ftype - requested factorization; LU, ILU, ILUDT, Cholesky and ICC are accepted, anything else raises PETSC_ERR_SUP
  B     - on output, a new MATSEQAIJCUSPARSE matrix of size n x n (n = A->rmap->n) with its symbolic
          factorization callbacks and preferred orderings filled in

  When A is bound to the CPU the host SeqAIJ symbolic routines are installed instead of the CUSPARSE ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt nloc = A->rmap->n;
  Mat            F;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  F = *B;
  PetscCall(MatSetSizes(F, nloc, nloc, nloc, nloc));
  F->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(F, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(F, PETSC_TRUE));

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(F, A, A));
    if (A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(F, MAT_SKIP_ALLOCATION, NULL));
  F->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)F, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2335 
/*
  MatSeqAIJCUSPARSECopyFromGPU - copies the matrix values from the device into the host array a->a

  Nothing happens unless A->offloadmask is PETSC_OFFLOAD_GPU. For an unfactored matrix the values come
  from the device CSR matrix; for a factored matrix (CUDA >= 11.4 only) they come from the factor's
  csrVal array when it exists. Any other factored matrix raises PETSC_ERR_SUP. On success the offload
  mask becomes PETSC_OFFLOAD_BOTH. Only values are transferred, never the index arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
    CsrMatrix          *csr  = (CsrMatrix *)cusp->mat->mat;

    PetscCallCUDA(cudaMemcpy(aij->a, csr->values->data().get(), aij->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* for a factored matrix spptr holds the triangular-factor data instead */
    Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);

    if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(aij->a, fs->csrVal, aij->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    } else
#endif
    {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    }
  }
  PetscCall(PetscLogGpuToCpu(aij->nz * sizeof(PetscScalar)));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2365 
/* Read-write access to the host value array: the values are first brought back from the GPU if needed. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2373 
/*
  Ends read-write access: the offload mask is set to PETSC_OFFLOAD_CPU, so the next
  MatSeqAIJCUSPARSECopyToGPU() uploads the host values again, and the caller's pointer is cleared.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2381 
/* Read-only access to the host value array: the values are first brought back from the GPU if needed. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2389 
/* Ends read-only access: only the caller's pointer is cleared, the offload state of the matrix is untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  (*array) = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2396 
/* Write-only access to the host value array: nothing is copied back from the GPU beforehand. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2403 
/*
  Ends write-only access: the offload mask is set to PETSC_OFFLOAD_CPU, so the next
  MatSeqAIJCUSPARSECopyToGPU() uploads the host values again, and the caller's pointer is cleared.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2411 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - gives access to the device CSR arrays of an unfactored matrix

  Input Parameter:
. A - the matrix; must not be a factored matrix

  Output Parameters (each may be NULL, in which case it is skipped):
+ i     - row offsets on the device
. j     - column indices on the device
. a     - values on the device
- mtype - set to PETSC_MEMTYPE_CUDA

  Notes:
  The matrix is brought up to date on the GPU with MatSeqAIJCUSPARSECopyToGPU() before the pointers are handed out.

  In builds with 64-bit PetscInt, asking for i or j raises PETSC_ERR_SUP.

  The factored-matrix and NULL checks are done before MatSeqAIJCUSPARSECopyToGPU(), because that routine
  dereferences A->spptr as a Mat_SeqAIJCUSPARSE right away; checking afterwards would be too late.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* cusp->mat may have been rebuilt by the copy, so it is read only now */
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2442 
/*
  MatSeqAIJCUSPARSECopyToGPU - brings the device copy of a MATSEQAIJCUSPARSE matrix up to date

  Input Parameter:
. A - the matrix; must not be bound to the CPU (PETSC_ERR_GPU otherwise)

  Notes:
  Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  If the nonzero state recorded in the CUSPARSE data matches A->nonzerostate and the storage format is
  CSR, only the values are copied into the existing device matrix. Otherwise the device matrix, the
  cached transpose, the work vector and the row offsets are destroyed and everything is rebuilt from the
  host CSR arrays (the compressed-row arrays when a->compressedrow.use is set).

  The offload mask becomes PETSC_OFFLOAD_BOTH unless the host value array a->a is missing, in which case
  only the structure is copied and the mask is left as it was.

  Thrust containers and operator new report failures by throwing types derived from std::exception.
  Those are translated into PETSc errors here; the pre-existing handler for char* is kept as is.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* assign() may throw; turn that into a PETSc error instead of letting it escape */
      PetscCallCXX(matrix->values->assign(a->a, a->a + a->nz));
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, structure did not: the cached transpose is invalidated but not destroyed */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* throw away everything that depends on the old nonzero structure */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values: copy the structure only and leave the offload mask alone */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* scalar constants 1, 0, 1 kept on the device; the handle is switched to device pointer mode below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* temporary device CSR matrix, converted to the hybrid format and then released */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      } catch (const std::exception &ex) {
        /* the bare new expressions and the Thrust assign() calls above throw std::exception subclasses */
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "C++ exception while copying matrix to GPU: %s", ex.what());
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2592 
/* Functor for zipped ranges: adds element 0 of the tuple to element 1 and stores the sum in element 1. */
struct VecCUDAPlusEquals {
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
2600 
/* Functor for zipped ranges: copies element 0 of the tuple into element 1. */
struct VecCUDAEquals {
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
2608 
/* Functor for zipped ranges: copies element 1 of the tuple into element 0 (the reverse of VecCUDAEquals). */
struct VecCUDAEqualsReverse {
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2616 
/*
  Data attached to a matrix product (C->product->data) by the CUSPARSE product routines.
  It is released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar   *Bt;       /* device buffer, released with cudaFree() */
  Mat            X;        /* auxiliary matrix; the numeric phase writes into it for RARt and PtAP products */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;     /* released with delete */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  /* Record leading dimensions of B and C here to detect changes */
  PetscInt Blda;
  PetscInt Clda;
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* device buffer, released with cudaFree() */
  void *dBuffer5; /* device buffer, released with cudaFree() */
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;  /* device buffer, released with cudaFree() */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2641 
/*
  MatDestroy_MatMatCusparse - releases a MatMatCusparse and everything it holds

  data - pointer to the MatMatCusparse; device buffers, cuSPARSE descriptors, the CSR matrix Bcsr and
         the auxiliary matrix X are released before the structure itself is freed
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = static_cast<MatMatCusparse *>(data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors first, each only if it was created */
  if (mm->matSpBDescr) { PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr)); }
  if (mm->matBDescr) { PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr)); }
  if (mm->matCDescr) { PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr)); }
  if (mm->spgemmDesc) { PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc)); }
  /* then the device work buffers */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) { PetscCallCUDA(cudaFree(mm->dBuffer4)); }
  if (mm->dBuffer5) { PetscCallCUDA(cudaFree(mm->dBuffer5)); }
  #endif
  if (mm->mmBuffer) { PetscCallCUDA(cudaFree(mm->mmBuffer)); }
  if (mm->mmBuffer2) { PetscCallCUDA(cudaFree(mm->mmBuffer2)); }
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2665 
2666 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2667 
2668 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2669 {
2670   Mat_Product                  *product = C->product;
2671   Mat                           A, B;
2672   PetscInt                      m, n, blda, clda;
2673   PetscBool                     flg, biscuda;
2674   Mat_SeqAIJCUSPARSE           *cusp;
2675   cusparseStatus_t              stat;
2676   cusparseOperation_t           opA;
2677   const PetscScalar            *barray;
2678   PetscScalar                  *carray;
2679   MatMatCusparse               *mmdata;
2680   Mat_SeqAIJCUSPARSEMultStruct *mat;
2681   CsrMatrix                    *csrmat;
2682 
2683   PetscFunctionBegin;
2684   MatCheckProduct(C, 1);
2685   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2686   mmdata = (MatMatCusparse *)product->data;
2687   A      = product->A;
2688   B      = product->B;
2689   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2690   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2691   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2692      Instead of silently accepting the wrong answer, I prefer to raise the error */
2693   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2694   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2695   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2696   switch (product->type) {
2697   case MATPRODUCT_AB:
2698   case MATPRODUCT_PtAP:
2699     mat = cusp->mat;
2700     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2701     m   = A->rmap->n;
2702     n   = B->cmap->n;
2703     break;
2704   case MATPRODUCT_AtB:
2705     if (!A->form_explicit_transpose) {
2706       mat = cusp->mat;
2707       opA = CUSPARSE_OPERATION_TRANSPOSE;
2708     } else {
2709       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2710       mat = cusp->matTranspose;
2711       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2712     }
2713     m = A->cmap->n;
2714     n = B->cmap->n;
2715     break;
2716   case MATPRODUCT_ABt:
2717   case MATPRODUCT_RARt:
2718     mat = cusp->mat;
2719     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2720     m   = A->rmap->n;
2721     n   = B->rmap->n;
2722     break;
2723   default:
2724     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2725   }
2726   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2727   csrmat = (CsrMatrix *)mat->mat;
2728   /* if the user passed a CPU matrix, copy the data to the GPU */
2729   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2730   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2731   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2732 
2733   PetscCall(MatDenseGetLDA(B, &blda));
2734   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2735     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2736     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2737   } else {
2738     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2739     PetscCall(MatDenseGetLDA(C, &clda));
2740   }
2741 
2742   PetscCall(PetscLogGpuTimeBegin());
2743 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2744   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2745   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2746   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2747   #else
2748   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2749   #endif
2750 
2751   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2752   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2753     size_t mmBufferSize;
2754     if (mmdata->initialized && mmdata->Blda != blda) {
2755       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2756       mmdata->matBDescr = NULL;
2757     }
2758     if (!mmdata->matBDescr) {
2759       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2760       mmdata->Blda = blda;
2761     }
2762 
2763     if (mmdata->initialized && mmdata->Clda != clda) {
2764       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2765       mmdata->matCDescr = NULL;
2766     }
2767     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2768       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2769       mmdata->Clda = clda;
2770     }
2771 
2772   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2773     if (matADescr) {
2774       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2775       matADescr = NULL;
2776     }
2777   #endif
2778 
2779     if (!matADescr) {
2780       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2781                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2782       PetscCallCUSPARSE(stat);
2783     }
2784 
2785     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2786 
2787     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2788       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2789       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2790       mmdata->mmBufferSize = mmBufferSize;
2791     }
2792 
2793   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2794     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2795   #endif
2796 
2797     mmdata->initialized = PETSC_TRUE;
2798   } else {
2799     /* to be safe, always update pointers of the mats */
2800     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2801     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2802     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2803   }
2804 
2805   /* do cusparseSpMM, which supports transpose on B */
2806   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2807 #else
2808   PetscInt k;
2809   /* cusparseXcsrmm does not support transpose on B */
2810   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2811     cublasHandle_t cublasv2handle;
2812     cublasStatus_t cerr;
2813 
2814     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2815     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2816     PetscCallCUBLAS(cerr);
2817     blda = B->cmap->n;
2818     k    = B->cmap->n;
2819   } else {
2820     k = B->rmap->n;
2821   }
2822 
2823   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2824   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2825   PetscCallCUSPARSE(stat);
2826 #endif
2827   PetscCall(PetscLogGpuTimeEnd());
2828   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2829   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2830   if (product->type == MATPRODUCT_RARt) {
2831     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2832     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2833   } else if (product->type == MATPRODUCT_PtAP) {
2834     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2835     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2836   } else {
2837     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2838   }
2839   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2840   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2841   PetscFunctionReturn(PETSC_SUCCESS);
2842 }
2843 
2844 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2845 {
2846   Mat_Product        *product = C->product;
2847   Mat                 A, B;
2848   PetscInt            m, n;
2849   PetscBool           cisdense, flg;
2850   MatMatCusparse     *mmdata;
2851   Mat_SeqAIJCUSPARSE *cusp;
2852 
2853   PetscFunctionBegin;
2854   MatCheckProduct(C, 1);
2855   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2856   A = product->A;
2857   B = product->B;
2858   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2859   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2860   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2861   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2862   switch (product->type) {
2863   case MATPRODUCT_AB:
2864     m = A->rmap->n;
2865     n = B->cmap->n;
2866     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2867     break;
2868   case MATPRODUCT_AtB:
2869     m = A->cmap->n;
2870     n = B->cmap->n;
2871     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2872     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2873     break;
2874   case MATPRODUCT_ABt:
2875     m = A->rmap->n;
2876     n = B->rmap->n;
2877     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2878     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2879     break;
2880   case MATPRODUCT_PtAP:
2881     m = B->cmap->n;
2882     n = B->cmap->n;
2883     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2884     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2885     break;
2886   case MATPRODUCT_RARt:
2887     m = B->rmap->n;
2888     n = B->rmap->n;
2889     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2890     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2891     break;
2892   default:
2893     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2894   }
2895   PetscCall(MatSetSizes(C, m, n, m, n));
2896   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2897   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2898   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2899 
2900   /* product data */
2901   PetscCall(PetscNew(&mmdata));
2902   mmdata->cisdense = cisdense;
2903 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2904   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2905   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2906 #endif
2907   /* for these products we need intermediate storage */
2908   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2909     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2910     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2911     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2912       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2913     } else {
2914       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2915     }
2916   }
2917   C->product->data    = mmdata;
2918   C->product->destroy = MatDestroy_MatMatCusparse;
2919 
2920   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2921   PetscFunctionReturn(PETSC_SUCCESS);
2922 }
2923 
2924 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2925 {
2926   Mat_Product                  *product = C->product;
2927   Mat                           A, B;
2928   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2929   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2930   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2931   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2932   PetscBool                     flg;
2933   cusparseStatus_t              stat;
2934   MatProductType                ptype;
2935   MatMatCusparse               *mmdata;
2936 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2937   cusparseSpMatDescr_t BmatSpDescr;
2938 #endif
2939   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2940 
2941   PetscFunctionBegin;
2942   MatCheckProduct(C, 1);
2943   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2944   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2945   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2946   mmdata = (MatMatCusparse *)C->product->data;
2947   A      = product->A;
2948   B      = product->B;
2949   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2950     mmdata->reusesym = PETSC_FALSE;
2951     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2952     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2953     Cmat = Ccusp->mat;
2954     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2955     Ccsr = (CsrMatrix *)Cmat->mat;
2956     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2957     goto finalize;
2958   }
2959   if (!c->nz) goto finalize;
2960   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2961   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2962   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2963   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2964   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2965   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2966   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2967   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2968   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2969   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2970   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2971   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2972   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2973   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2974 
2975   ptype = product->type;
2976   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2977     ptype = MATPRODUCT_AB;
2978     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2979   }
2980   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2981     ptype = MATPRODUCT_AB;
2982     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2983   }
2984   switch (ptype) {
2985   case MATPRODUCT_AB:
2986     Amat = Acusp->mat;
2987     Bmat = Bcusp->mat;
2988     break;
2989   case MATPRODUCT_AtB:
2990     Amat = Acusp->matTranspose;
2991     Bmat = Bcusp->mat;
2992     break;
2993   case MATPRODUCT_ABt:
2994     Amat = Acusp->mat;
2995     Bmat = Bcusp->matTranspose;
2996     break;
2997   default:
2998     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2999   }
3000   Cmat = Ccusp->mat;
3001   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3002   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3003   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3004   Acsr = (CsrMatrix *)Amat->mat;
3005   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3006   Ccsr = (CsrMatrix *)Cmat->mat;
3007   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3008   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3009   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
3010   PetscCall(PetscLogGpuTimeBegin());
3011 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3012   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3013   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3014   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3015   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3016   PetscCallCUSPARSE(stat);
3017   #else
3018   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3019   PetscCallCUSPARSE(stat);
3020   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3021   PetscCallCUSPARSE(stat);
3022   #endif
3023 #else
3024   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3025                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3026   PetscCallCUSPARSE(stat);
3027 #endif
3028   PetscCall(PetscLogGpuFlops(mmdata->flops));
3029   PetscCallCUDA(WaitForCUDA());
3030   PetscCall(PetscLogGpuTimeEnd());
3031   C->offloadmask = PETSC_OFFLOAD_GPU;
3032 finalize:
3033   /* shorter version of MatAssemblyEnd_SeqAIJ */
3034   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3035   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3036   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3037   c->reallocs = 0;
3038   C->info.mallocs += 0;
3039   C->info.nz_unneeded = 0;
3040   C->assembled = C->was_assembled = PETSC_TRUE;
3041   C->num_ass++;
3042   PetscFunctionReturn(PETSC_SUCCESS);
3043 }
3044 
3045 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3046 {
3047   Mat_Product                  *product = C->product;
3048   Mat                           A, B;
3049   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3050   Mat_SeqAIJ                   *a, *b, *c;
3051   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3052   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3053   PetscInt                      i, j, m, n, k;
3054   PetscBool                     flg;
3055   cusparseStatus_t              stat;
3056   MatProductType                ptype;
3057   MatMatCusparse               *mmdata;
3058   PetscLogDouble                flops;
3059   PetscBool                     biscompressed, ciscompressed;
3060 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3061   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3062   cusparseSpMatDescr_t BmatSpDescr;
3063 #else
3064   int cnz;
3065 #endif
3066   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3067 
3068   PetscFunctionBegin;
3069   MatCheckProduct(C, 1);
3070   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3071   A = product->A;
3072   B = product->B;
3073   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3074   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3075   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3076   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3077   a = (Mat_SeqAIJ *)A->data;
3078   b = (Mat_SeqAIJ *)B->data;
3079   /* product data */
3080   PetscCall(PetscNew(&mmdata));
3081   C->product->data    = mmdata;
3082   C->product->destroy = MatDestroy_MatMatCusparse;
3083 
3084   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3085   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3086   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3087   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3088   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3089   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3090 
3091   ptype = product->type;
3092   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3093     ptype                                          = MATPRODUCT_AB;
3094     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3095   }
3096   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3097     ptype                                          = MATPRODUCT_AB;
3098     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3099   }
3100   biscompressed = PETSC_FALSE;
3101   ciscompressed = PETSC_FALSE;
3102   switch (ptype) {
3103   case MATPRODUCT_AB:
3104     m    = A->rmap->n;
3105     n    = B->cmap->n;
3106     k    = A->cmap->n;
3107     Amat = Acusp->mat;
3108     Bmat = Bcusp->mat;
3109     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3110     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3111     break;
3112   case MATPRODUCT_AtB:
3113     m = A->cmap->n;
3114     n = B->cmap->n;
3115     k = A->rmap->n;
3116     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3117     Amat = Acusp->matTranspose;
3118     Bmat = Bcusp->mat;
3119     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3120     break;
3121   case MATPRODUCT_ABt:
3122     m = A->rmap->n;
3123     n = B->rmap->n;
3124     k = A->cmap->n;
3125     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3126     Amat = Acusp->mat;
3127     Bmat = Bcusp->matTranspose;
3128     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3129     break;
3130   default:
3131     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3132   }
3133 
3134   /* create cusparse matrix */
3135   PetscCall(MatSetSizes(C, m, n, m, n));
3136   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3137   c     = (Mat_SeqAIJ *)C->data;
3138   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3139   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3140   Ccsr  = new CsrMatrix;
3141 
3142   c->compressedrow.use = ciscompressed;
3143   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3144     c->compressedrow.nrows = a->compressedrow.nrows;
3145     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3146     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3147     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3148     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3149     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3150   } else {
3151     c->compressedrow.nrows  = 0;
3152     c->compressedrow.i      = NULL;
3153     c->compressedrow.rindex = NULL;
3154     Ccusp->workVector       = NULL;
3155     Cmat->cprowIndices      = NULL;
3156   }
3157   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3158   Ccusp->mat        = Cmat;
3159   Ccusp->mat->mat   = Ccsr;
3160   Ccsr->num_rows    = Ccusp->nrows;
3161   Ccsr->num_cols    = n;
3162   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3163   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3164   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3165   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3166   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3167   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3168   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3169   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3170   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3171   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3173     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3174     c->nz                = 0;
3175     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3176     Ccsr->values         = new THRUSTARRAY(c->nz);
3177     goto finalizesym;
3178   }
3179 
3180   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3181   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3182   Acsr = (CsrMatrix *)Amat->mat;
3183   if (!biscompressed) {
3184     Bcsr = (CsrMatrix *)Bmat->mat;
3185 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3186     BmatSpDescr = Bmat->matDescr;
3187 #endif
3188   } else { /* we need to use row offsets for the full matrix */
3189     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3190     Bcsr                 = new CsrMatrix;
3191     Bcsr->num_rows       = B->rmap->n;
3192     Bcsr->num_cols       = cBcsr->num_cols;
3193     Bcsr->num_entries    = cBcsr->num_entries;
3194     Bcsr->column_indices = cBcsr->column_indices;
3195     Bcsr->values         = cBcsr->values;
3196     if (!Bcusp->rowoffsets_gpu) {
3197       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3198       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3199       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3200     }
3201     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3202     mmdata->Bcsr      = Bcsr;
3203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3204     if (Bcsr->num_rows && Bcsr->num_cols) {
3205       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3206       PetscCallCUSPARSE(stat);
3207     }
3208     BmatSpDescr = mmdata->matSpBDescr;
3209 #endif
3210   }
3211   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3212   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3213   /* precompute flops count */
3214   if (ptype == MATPRODUCT_AB) {
3215     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3216       const PetscInt st = a->i[i];
3217       const PetscInt en = a->i[i + 1];
3218       for (j = st; j < en; j++) {
3219         const PetscInt brow = a->j[j];
3220         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3221       }
3222     }
3223   } else if (ptype == MATPRODUCT_AtB) {
3224     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3225       const PetscInt anzi = a->i[i + 1] - a->i[i];
3226       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3227       flops += (2. * anzi) * bnzi;
3228     }
3229   } else { /* TODO */
3230     flops = 0.;
3231   }
3232 
3233   mmdata->flops = flops;
3234   PetscCall(PetscLogGpuTimeBegin());
3235 
3236 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3237   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3238   // cuda-12.2 requires non-null csrRowOffsets
3239   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3240   PetscCallCUSPARSE(stat);
3241   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3242   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3243   {
3244     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3245      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3246   */
3247     void *dBuffer1 = NULL;
3248     void *dBuffer2 = NULL;
3249     void *dBuffer3 = NULL;
3250     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3251     size_t bufferSize1 = 0;
3252     size_t bufferSize2 = 0;
3253     size_t bufferSize3 = 0;
3254     size_t bufferSize4 = 0;
3255     size_t bufferSize5 = 0;
3256 
3257     /* ask bufferSize1 bytes for external memory */
3258     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3259     PetscCallCUSPARSE(stat);
3260     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3261     /* inspect the matrices A and B to understand the memory requirement for the next step */
3262     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3263     PetscCallCUSPARSE(stat);
3264 
3265     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3266     PetscCallCUSPARSE(stat);
3267     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3268     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3269     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3270     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3271     PetscCallCUSPARSE(stat);
3272     PetscCallCUDA(cudaFree(dBuffer1));
3273     PetscCallCUDA(cudaFree(dBuffer2));
3274 
3275     /* get matrix C non-zero entries C_nnz1 */
3276     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3277     c->nz = (PetscInt)C_nnz1;
3278     /* allocate matrix C */
3279     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3280     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3281     Ccsr->values = new THRUSTARRAY(c->nz);
3282     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3283     /* update matC with the new pointers */
3284     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3285     PetscCallCUSPARSE(stat);
3286 
3287     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3288     PetscCallCUSPARSE(stat);
3289     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3290     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3291     PetscCallCUSPARSE(stat);
3292     PetscCallCUDA(cudaFree(dBuffer3));
3293     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3294     PetscCallCUSPARSE(stat);
3295     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3296   }
3297   #else
3298   size_t bufSize2;
3299   /* ask bufferSize bytes for external memory */
3300   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3301   PetscCallCUSPARSE(stat);
3302   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3303   /* inspect the matrices A and B to understand the memory requirement for the next step */
3304   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3305   PetscCallCUSPARSE(stat);
3306   /* ask bufferSize again bytes for external memory */
3307   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3308   PetscCallCUSPARSE(stat);
3309   /* The CUSPARSE documentation is not clear, nor the API
3310      We need both buffers to perform the operations properly!
3311      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3312      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3313      is stored in the descriptor! What a messy API... */
3314   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3315   /* compute the intermediate product of A * B */
3316   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3317   PetscCallCUSPARSE(stat);
3318   /* get matrix C non-zero entries C_nnz1 */
3319   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3320   c->nz = (PetscInt)C_nnz1;
3321   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3322                       mmdata->mmBufferSize / 1024));
3323   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3324   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3325   Ccsr->values = new THRUSTARRAY(c->nz);
3326   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3327   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3328   PetscCallCUSPARSE(stat);
3329   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3330   PetscCallCUSPARSE(stat);
3331   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3332 #else
3333   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3334   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3335                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3336   PetscCallCUSPARSE(stat);
3337   c->nz                = cnz;
3338   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3339   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3340   Ccsr->values = new THRUSTARRAY(c->nz);
3341   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3342 
3343   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3344   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3345      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3346      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3347   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3348                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3349   PetscCallCUSPARSE(stat);
3350 #endif
3351   PetscCall(PetscLogGpuFlops(mmdata->flops));
3352   PetscCall(PetscLogGpuTimeEnd());
3353 finalizesym:
3354   c->free_a = PETSC_TRUE;
3355   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3356   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3357   c->free_ij = PETSC_TRUE;
3358   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3359     PetscInt      *d_i = c->i;
3360     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3361     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3362     ii = *Ccsr->row_offsets;
3363     jj = *Ccsr->column_indices;
3364     if (ciscompressed) d_i = c->compressedrow.i;
3365     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3366     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3367   } else {
3368     PetscInt *d_i = c->i;
3369     if (ciscompressed) d_i = c->compressedrow.i;
3370     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3371     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3372   }
3373   if (ciscompressed) { /* need to expand host row offsets */
3374     PetscInt r = 0;
3375     c->i[0]    = 0;
3376     for (k = 0; k < c->compressedrow.nrows; k++) {
3377       const PetscInt next = c->compressedrow.rindex[k];
3378       const PetscInt old  = c->compressedrow.i[k];
3379       for (; r < next; r++) c->i[r + 1] = old;
3380     }
3381     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3382   }
3383   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3384   PetscCall(PetscMalloc1(m, &c->ilen));
3385   PetscCall(PetscMalloc1(m, &c->imax));
3386   c->maxnz         = c->nz;
3387   c->nonzerorowcnt = 0;
3388   c->rmax          = 0;
3389   for (k = 0; k < m; k++) {
3390     const PetscInt nn = c->i[k + 1] - c->i[k];
3391     c->ilen[k] = c->imax[k] = nn;
3392     c->nonzerorowcnt += (PetscInt)!!nn;
3393     c->rmax = PetscMax(c->rmax, nn);
3394   }
3395   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3396   PetscCall(PetscMalloc1(c->nz, &c->a));
3397   Ccsr->num_entries = c->nz;
3398 
3399   C->nonzerostate++;
3400   PetscCall(PetscLayoutSetUp(C->rmap));
3401   PetscCall(PetscLayoutSetUp(C->cmap));
3402   Ccusp->nonzerostate = C->nonzerostate;
3403   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3404   C->preallocated     = PETSC_TRUE;
3405   C->assembled        = PETSC_FALSE;
3406   C->was_assembled    = PETSC_FALSE;
3407   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3408     mmdata->reusesym = PETSC_TRUE;
3409     C->offloadmask   = PETSC_OFFLOAD_GPU;
3410   }
3411   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3412   PetscFunctionReturn(PETSC_SUCCESS);
3413 }
3414 
3415 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3416 
/*
  Selects the symbolic-product routine for a product whose A is SEQAIJCUSPARSE; handles sparse or dense B.

  The GPU sparse-sparse path is only considered when neither A nor B is bound to the CPU, B is
  MATSEQAIJCUSPARSE and (for ABC products) C is an unbound MATSEQAIJCUSPARSE as well. Even then the
  user may request the CPU implementation through a per-product "backend_cpu" option.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu    = PETSC_FALSE;
    const char *apiname   = NULL; /* name of the user-level API routine; also used as the manual page */
    const char *apioption = NULL; /* option consulted when the product was created through that API */
    const char *prodtitle = NULL; /* options title used when the product was created via MatProduct */

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname   = "MatMatMult";
      apioption = "-matmatmult_backend_cpu";
      prodtitle = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname   = "MatTransposeMatMult";
      apioption = "-mattransposematmult_backend_cpu";
      prodtitle = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname   = "MatPtAP";
      apioption = "-matptap_backend_cpu";
      prodtitle = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname   = "MatRARt";
      apioption = "-matrart_backend_cpu";
      prodtitle = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname   = "MatMatMatMult";
      apioption = "-matmatmatmult_backend_cpu";
      prodtitle = "MatProduct_ABC";
      break;
    default: /* no CPU-backend option is queried for the remaining product types */
      break;
    }

    if (apiname) {
      const char *title  = product->api_user ? apiname : prodtitle;
      const char *option = product->api_user ? apioption : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(option, "Use CPU code", apiname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (!isdense && !(Biscusp && Ciscusp)) { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      /* a CPU-bound A defers to the host sparse-dense setup */
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      }
      break;
    default:
      break;
    }
  } else { /* B (and C, if relevant) are SEQAIJCUSPARSE and the GPU backend was not declined */
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3534 
/* yy = A xx, delegated to the shared multiply-add kernel with no vector to add */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool transpose = PETSC_FALSE, hermitian = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, transpose, hermitian));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3541 
/* zz = A xx + yy, delegated to the shared multiply-add kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_FALSE, hermitian = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, transpose, hermitian));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3548 
/* yy = A^H xx, delegated to the shared multiply-add kernel with no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool transpose = PETSC_TRUE, hermitian = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, transpose, hermitian));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3555 
/* zz = A^H xx + yy, delegated to the shared multiply-add kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_TRUE, hermitian = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, transpose, hermitian));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3562 
/* yy = A^T xx, delegated to the shared multiply-add kernel with no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool transpose = PETSC_TRUE, hermitian = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, transpose, hermitian));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3569 
/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n.

  Launch layout: 1D grid of 1D blocks with at least n threads in total; one thread handles one
  entry and surplus threads do nothing. No shared memory is used.

  The update is a plain (non-atomic) read-modify-write, so the result is only well defined when
  the entries of idx are pairwise distinct. NOTE(review): the caller in this file passes the
  compressed-row index array, which is presumably duplicate-free - confirm.

  The global index is formed in PetscInt rather than int: blockIdx.x * blockDim.x is evaluated in
  32-bit unsigned arithmetic and would wrap (or turn negative once stored in an int) for very
  large n when PetscInt is 64-bit, letting out-of-range threads slip past the bounds check.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x * (PetscInt)blockDim.x + (PetscInt)threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3575 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Input:
     A     - the matrix; its GPU copy is refreshed before the multiply
     xx    - input vector
     yy    - vector to add, or NULL for a pure product; may be the same object as zz
     trans - apply the transpose
     herm  - apply the conjugate transpose (only valid together with trans)
   Output:
     zz    - result vector

   When trans is set, herm is not, and A->form_explicit_transpose is set, the explicitly stored transpose is
   multiplied with CUSPARSE_OPERATION_NON_TRANSPOSE; otherwise the stored matrix is used with the requested opA.

   If the selected matrix structure uses compressed rows (cprowIndices is set), cusparsestruct->workVector holds the
   short vector: the product in the non-transposed case, the gathered input in the transposed case.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: the product vanishes, so z = y (or z = 0) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length and is written straight into z. We may only accumulate (beta = 1) when z is y:
         in every other case zarray was obtained write-only above and can hold stale values, so the product must
         overwrite it (beta = 0); a distinct y is added after the multiply.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status itself; catch launch configuration errors */
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* the transposed product overwrote zz (beta = 0) unless zz is yy, so a distinct yy is added here */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3761 
/* zz = A^T xx + yy, delegated to the shared multiply-add kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_TRUE, hermitian = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, transpose, hermitian));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3768 
/* Finishes assembly by forwarding to the host SeqAIJ implementation; nothing else is done here */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3775 
/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* create the object, fix its sizes (local and global sizes are both m x n) and type, then preallocate */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* the preallocation routine takes a non-const array, hence the cast */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, const_cast<PetscInt *>(nnz)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3823 
/*
  Destroy routine installed in A->ops->destroy for this matrix type.

  Releases the GPU-side data hanging off A->spptr (the multiply structures for an unfactored matrix,
  the triangular-factor data otherwise), removes the functions composed on the object, and finally
  hands the matrix to MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed function names, removed in this order */
  static const char *const composed_names[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };
  const size_t n_names = sizeof(composed_names) / sizeof(composed_names[0]);

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  }
  for (size_t k = 0; k < n_names; k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed_names[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3845 
3846 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3847 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate A as a SeqAIJ matrix first, then convert the copy in place to MATSEQAIJCUSPARSE. */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* *B is both the input and the output of the in-place conversion */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3855 
/*
  Y = a*X + Y for MATSEQAIJCUSPARSE matrices.

  Input Parameters:
    Y   - matrix that is updated
    a   - scalar multiplier of X
    X   - matrix that is added
    str - relation between the nonzero patterns of X and Y

  Paths taken:
    - X and Y do not share the same axpy implementation: fall back to MatAXPY_SeqAIJ().
    - SAME_NONZERO_PATTERN (given, or detected by comparing the device CSR index arrays): one cuBLAS axpy
      over the value arrays.
    - SUBSET_NONZERO_PATTERN: cuSPARSE spgeam writing into the existing CSR structure of Y.
    - anything else: fall back to MatAXPY_SeqAIJ().
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *yaij  = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *xcusp = (Mat_SeqAIJCUSPARSE *)X->spptr;
  Mat_SeqAIJCUSPARSE *ycusp = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  const PetscScalar  *xvals;
  PetscScalar        *yvals;
  CsrMatrix          *xcsr, *ycsr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(ycusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(xcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix *)ycusp->mat->mat;
  xcsr = (CsrMatrix *)xcusp->mat->mat;

  /* see if we can turn this into a cublas axpy: same nonzero count and identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool identical = thrust::equal(thrust::device, ycsr->row_offsets->begin(), ycsr->row_offsets->end(), xcsr->row_offsets->begin()) && thrust::equal(thrust::device, ycsr->column_indices->begin(), ycsr->column_indices->end(), xcsr->column_indices->begin());
    if (identical) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  /* patterns that are neither identical nor a subset are handled by the SeqAIJ implementation */
  if (str != SUBSET_NONZERO_PATTERN && str != SAME_NONZERO_PATTERN) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xvals));
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yvals));
  if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublas;
    PetscBLASInt   inc = 1, nvals = 1;

    PetscCall(PetscCUBLASGetHandle(&cublas));
    PetscCall(PetscBLASIntCast(xaij->nz, &nvals));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublas, nvals, &a, xvals, inc, yvals, inc));
    PetscCall(PetscLogGpuFlops(2.0 * nvals));
    PetscCall(PetscLogGpuTimeEnd());
  } else { /* SUBSET_NONZERO_PATTERN */
    PetscScalar one  = 1.0;
    auto        xrow = xcsr->row_offsets->data().get();
    auto        xcol = xcsr->column_indices->data().get();
    auto        yrow = ycsr->row_offsets->data().get();
    auto        ycol = ycsr->column_indices->data().get();
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t work_bytes;
    void  *work;
#endif

    /* a and one live on the host, so switch the pointer mode for the duration of the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xvals, xrow, xcol, &one, ycusp->mat->descr, yaij->nz, yvals, yrow, ycol, ycusp->mat->descr, yvals, yrow, ycol, &work_bytes));
    PetscCallCUDA(cudaMalloc(&work, work_bytes));
#endif
    PetscCall(PetscLogGpuTimeBegin());
    /* the result is written back into the value array and CSR structure of Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xvals, xrow, xcol, &one, ycusp->mat->descr, yaij->nz, yvals, yrow, ycol, ycusp->mat->descr, yvals, yrow, ycol, work));
#else
    PetscCallCUSPARSE(cusparse_csr_spgeam(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xvals, xrow, xcol, &one, ycusp->mat->descr, yaij->nz, yvals, yrow, ycol, ycusp->mat->descr, yvals, yrow, ycol));
#endif
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUDA(cudaFree(work));
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  }
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xvals));
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yvals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3941 
/* Y = a*Y: scales the device value array of Y in one cuBLAS scal call over all y->nz stored entries. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *yaij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t cublas;
  PetscBLASInt   inc = 1, nvals = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&cublas));
  PetscCall(PetscBLASIntCast(yaij->nz, &nvals));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublas, nvals, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(nvals));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3961 
/*
  Zero all stored values of A.

  For an unfactored matrix the device value arrays of the matrix and of its cached transpose are
  filled with zeros when they exist. The host value array is always zeroed. The offload mask is set
  to PETSC_OFFLOAD_BOTH only when the device values of the matrix itself were zeroed, otherwise to
  PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij        = (Mat_SeqAIJ *)A->data;
  PetscBool   gpu_zeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
        gpu_zeroed = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* a->i[number of rows] is the number of stored entries on the host */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpu_zeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3988 
/*
  Bind the matrix to the CPU (flg true) or unbind it (flg false).

  Factored matrices only record the flag. For other matrices the operation table and the composed
  functions are switched between the SeqAIJ (host) and SeqAIJCUSPARSE (device) implementations;
  when binding, the values are first copied back from the GPU. Inodes are used only when the matrix
  is bound to the CPU and inode information (a->inode.size_csr) is present.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (!flg) {
    /* device implementations of the Mat operations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* SeqAIJ-level array accessors */
    aij->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* composed functions that are removed while the matrix is bound to the CPU, in removal order */
    static const char *const removed_names[] = {
      "MatSeqAIJCopySubArray_C",
      "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
      "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
      "MatSetPreallocationCOO_C",
      "MatSetValuesCOO_C",
      "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    };

    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear every SeqAIJ-level override */
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    for (size_t k = 0; k < sizeof(removed_names) / sizeof(removed_names[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, removed_names[k], NULL));
  }
  A->boundtocpu  = flg;
  aij->inode.use = (flg && aij->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4052 
/*
  Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE.

  Input Parameters:
    A     - the matrix to convert
    reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the existing
            *newmat, any other value uses *newmat as it is

  Output Parameter:
    newmat - the converted matrix

  Unless the target is reused or already has one, a GPU data structure with its own cuSPARSE handle is
  created in B->spptr (multiply data for unfactored matrices, triangular-factor data otherwise). The
  type-specific operations and composed functions are then installed and the matrix is left unbound
  from the CPU.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that stay the same whether or not the matrix is bound to the CPU */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the device implementations of the remaining operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4112 
/* Type constructor: build B as a SeqAIJ matrix, then convert it in place to MATSEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  /* in-place conversion: B is both source and destination */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4120 
4121 /*MC
4122    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4123 
4124    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4125    CSR, ELL, or Hybrid format.
4126    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4127 
4128    Options Database Keys:
4129 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4130 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4131                                       Other options include ell (ellpack) or hyb (hybrid).
4132 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4133 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4134 
4135   Level: beginner
4136 
4137 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4138 M*/
4139 
/* Register the CUSPARSE solver for MATSEQAIJCUSPARSE with the LU, Cholesky, ILU and ICC factor types. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* every factor type is served by the same factory routine; registered in this order */
  const MatFactorType factor_types[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(factor_types) / sizeof(factor_types[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factor_types[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4149 
/*
  Free the GPU data attached to an unfactored matrix (mat->spptr): the multiply structures of the
  matrix and of its transpose, the auxiliary device arrays, the cuSPARSE handle, and the structure
  itself. Does nothing when mat->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!gpu) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->mat, gpu->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
  delete gpu->workVector;
  delete gpu->rowoffsets_gpu;
  delete gpu->csr2csc_i;
  delete gpu->coords;
  if (gpu->handle) PetscCallCUSPARSE(cusparseDestroy(gpu->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4167 
/* Delete a CsrMatrix together with its values, column-index and row-offset arrays, and set *mat to NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4180 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  Overload for a triangular-factor structure (only compiled for CUDA < 11.4): destroys the cuSPARSE
  descriptor and solve info, the CSR matrix and the device/pinned buffers, then frees the structure,
  which also resets *trifactor through PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(PETSC_SUCCESS);
  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4199 
/*
  Destroy a multiply structure and set *matstruct to NULL.

  Input Parameters:
    matstruct - address of the structure to destroy; nothing is done when it points to NULL
    format    - storage format of the matrix held in the structure; ELL and HYB are rejected with
                PETSC_ERR_SUP when built against CUDA >= 11.0
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms = *matstruct;

  PetscFunctionBegin;
  if (!ms) PetscFunctionReturn(PETSC_SUCCESS);

  if (ms->mat) {
    if (format != MAT_CUSPARSE_ELL && format != MAT_CUSPARSE_HYB) {
      CsrMatrix *csr = (CsrMatrix *)ms->mat;

      PetscCall(CsrMatrix_Destroy(&csr));
    } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      cusparseHybMat_t hybMat = (cusparseHybMat_t)ms->mat;
      PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
    }
  }
  if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
  delete ms->cprowIndices;
  /* device-resident scalar constants */
  if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
  if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
  if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));

  for (int k = 0; k < 3; k++) {
    /* NOTE(review): the per-slot SpMV/SpMM descriptors below are only released for initialized SpMV slots -- confirm none can exist otherwise */
    if (!ms->cuSpMV[k].initialized) continue;
    PetscCallCUDA(cudaFree(ms->cuSpMV[k].spmvBuffer));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[k].vecXDescr));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[k].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
    if (ms->matDescr_SpMV[k]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMV[k]));
    if (ms->matDescr_SpMM[k]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMM[k]));
  #endif
  }
#endif
  delete ms;
  *matstruct = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4246 
/*
  Release everything held by a triangular-factors structure except the structure itself and its
  cuSPARSE handle, so that it can be reused for a new factorization.

  Every released pointer and descriptor is reset to NULL. This makes the routine safe to call more
  than once on the same structure (MatSeqAIJCUSPARSETriFactors_Destroy() always calls it), instead of
  freeing the same device memory or descriptor twice.

  Input Parameter:
    trifactors - address of the structure; nothing is done when it points to NULL
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* each of these destroy calls resets the pointer it is given */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    /* factBuffer_M is not freed on its own since it shares with one of spsvBuffer_L/U; only forget the alias */
    fs->factBuffer_M = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;
    /* cuSPARSE descriptors */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;
    /* host arrays; PetscFree() resets the pointers itself */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4300 
/* Reset the triangular-factors structure, destroy its cuSPARSE handle and free the structure; no-op when *trifactors is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4311 
/* Strict ordering of index pairs: by the first tuple entry, ties broken by the second entry. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt first1 = thrust::get<0>(t1);
    const PetscInt first2 = thrust::get<0>(t2);

    return (first1 != first2) ? (first1 < first2) : (thrust::get<1>(t1) < thrust::get<1>(t2));
  }
};
4320 
/*
  Mark the cached transpose of A as out of date (A->transupdated = PETSC_FALSE). When destroy is
  true, the transpose multiply structure and the csr2csc index array are released as well.
  Nothing is done when A has no GPU data structure.
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (gpu) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
      delete gpu->csr2csc_i;
      gpu->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4336 
/* Destructor for the device COO structure: frees its perm and jmap arrays with cudaFree(), then the structure itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *dev_coo = static_cast<MatCOOStruct_SeqAIJ *>(*data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(dev_coo->perm));
  PetscCallCUDA(cudaFree(dev_coo->jmap));
  PetscCall(PetscFree(dev_coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4347 
/*
  COO preallocation for MATSEQAIJCUSPARSE.

  Input Parameters:
    mat   - the matrix
    coo_n - number of COO entries
    coo_i - row indices of the entries, in host or device memory
    coo_j - column indices of the entries, in the same memory space as coo_i

  The preallocation itself is done by MatSetPreallocationCOO_SeqAIJ() on host copies of the indices.
  Afterwards the matrix is copied to the GPU and a copy of the host COO structure, with its jmap and
  perm arrays in device memory, is attached to the matrix as "__PETSc_MatCOOStruct_Device".
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         ij_memtype = PETSC_MEMTYPE_HOST;
  PetscBool            ij_copied  = PETSC_FALSE;
  PetscInt            *host_i = coo_i, *host_j = coo_j;
  PetscContainer       host_container;
  MatCOOStruct_SeqAIJ *host_coo, *dev_coo;
  size_t               jmap_bytes, perm_bytes;

  PetscFunctionBegin;
  /* the memory space is decided from coo_i alone */
  PetscCall(PetscGetMemType(coo_i, &ij_memtype));
  if (PetscMemTypeDevice(ij_memtype)) {
    ij_copied = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &host_i, coo_n, &host_j));
    PetscCallCUDA(cudaMemcpy(host_i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(host_j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, host_i, host_j));
  if (ij_copied) PetscCall(PetscFree2(host_i, host_j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Fetch the host COO struct and mirror it on the device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&host_container));
  PetscCall(PetscContainerGetPointer(host_container, (void **)&host_coo));
  PetscCall(PetscMalloc1(1, &dev_coo));
  *dev_coo = *host_coo; // shallow copy; jmap and perm are replaced by device arrays below

  jmap_bytes = (host_coo->nz + 1) * sizeof(PetscCount);
  perm_bytes = host_coo->Atot * sizeof(PetscCount);
  PetscCallCUDA(cudaMalloc((void **)&dev_coo->jmap, jmap_bytes));
  PetscCallCUDA(cudaMemcpy(dev_coo->jmap, host_coo->jmap, jmap_bytes, cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&dev_coo->perm, perm_bytes));
  PetscCallCUDA(cudaMemcpy(dev_coo->perm, host_coo->perm, perm_bytes, cudaMemcpyHostToDevice));

  // Attach the device COO struct to the matrix inside a container
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", dev_coo, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4388 
/*
  Kernel that sums COO input values into the stored matrix values.

  Parameters:
    kv    - COO values supplied by the caller
    nnz   - number of entries of a[] to compute
    jmap  - for entry i, kv[perm[k]] with k in [jmap[i], jmap[i+1]) are the values that contribute to it
    perm  - indices into kv[]
    imode - INSERT_VALUES overwrites a[i] with the sum, any other mode adds the sum to a[i]
    a     - matrix value array that is updated

  Launch: 1D grid of 1D blocks. The loop strides by the total number of threads, so any grid size
  works. The index arithmetic is widened to PetscCount before multiplying so that it cannot wrap in
  32 bits for very large launches.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4399 
/*
  Set the matrix values from COO values, on the GPU.

  Input Parameters:
    A     - the matrix; MatSetPreallocationCOO() must have attached the device COO structure
    v     - the COO values, in host or device memory
    imode - INSERT_VALUES to overwrite the stored values, otherwise the sums are added to them

  Host values are first copied into a temporary device buffer. The number of thread blocks is
  computed in PetscCount and capped; the kernel loops over whatever the grid does not cover, so the
  cap does not limit the matrix size.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  /* the query leaves container NULL when nothing is attached under that name */
  PetscCheck(container, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Device MatCOOStruct not found on this matrix, MatSetPreallocationCOO() must be called first");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    const PetscCount threads    = 256;
    const PetscCount max_blocks = (PetscCount)1 << 22; /* keeps blocks * threads well inside 32 bits */
    const PetscCount wanted     = (Annz + threads - 1) / threads;
    const int        blocks     = (int)PetscMin(wanted, max_blocks);

    MatAddCOOValues<<<blocks, (int)threads>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4439 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, pass `NULL` if not needed
- j - the CSR column indices, pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  If both `i` and `j` are `NULL` the routine returns immediately without touching the matrix.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* nothing to do only when neither array is requested; each output is handled on its own below */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* built once from the host row offsets and cached on the matrix */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4487 
/*@C
  MatSeqAIJCUSPARSERestoreIJ - gives back the device `i` and `j` index arrays obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE`, the flag describing the compressed form; it is not used by this routine
. i          - the CSR row pointers, set to `NULL` on return; may itself be `NULL`
- j          - the CSR column indices, set to `NULL` on return; may itself be `NULL`

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed; /* accepted for symmetry with the Get routine only */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* nothing is released here, the caller's pointers are merely cleared */
  if (i != nullptr) { *i = nullptr; }
  if (j != nullptr) { *j = nullptr; }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4513 
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

  Only available when the matrix is not stored in the `MAT_CUSPARSE_ELL` or `MAT_CUSPARSE_HYB` format

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* look inside A only after it has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: the offload mask and the cached transpose are left untouched */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4549 
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only device array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values were only read, so the matrix state is not changed; just clear the caller's pointer */
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4572 
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

  The matrix is flagged as `PETSC_OFFLOAD_GPU` and its cached transpose is invalidated

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* look inside A only after it has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* the caller may read the current values, so they have to be up to date on the device */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller is about to modify the device copy */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - gives back the read-write device array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

  Note:
  Invalidates the diagonal information of the matrix and increases its object state

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4634 
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger host-device copies and flags data validity on the GPU

  Since no copy is triggered, the device matrix must already exist, otherwise an error is raised

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* look inside A only after it has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* unlike the read and read-write accessors there is no MatSeqAIJCUSPARSECopyToGPU() here: the old values will be overwritten */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy becomes the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4671 
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only device array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

  Note:
  Invalidates the diagonal information of the matrix and increases its object state

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values have been rewritten through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4696 
/*
  Orders 4-tuples lexicographically by their first and then their second component; the remaining two
  components do not take part in the comparison. The call operator is const so that the comparator can
  also be invoked through a const object or reference.
*/
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
4705 
/*
  Unary functor adding a fixed offset to an integer. The call operator does not modify the functor and
  is therefore const, which allows it to be invoked through a const object or reference.
*/
struct Shift {
  int _shift; /* the offset added to every argument */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};
4712 
/*
  MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
  [A';B']' operation in MATLAB notation

  Input Parameters:
+ A     - the first matrix, of type MATSEQAIJCUSPARSE
. B     - the second matrix, of type MATSEQAIJCUSPARSE, with the same number of rows as A
- reuse - with MAT_INITIAL_MATRIX the matrix C is created, with MAT_INPLACE_MATRIX an error is raised,
          otherwise the values of an already existing C are refreshed from A and B

  Output Parameter:
. C - the merged matrix, with A->rmap->n rows and A->cmap->n + B->cmap->n columns

  Notes:
  The merge is performed on the device. When C is created, Ccusp->coords records where every value of A
  (first Annz entries) and of B (remaining entries) ends up in the value array of C; the reuse path only
  scatters the values through this map.

  When C is created, only the row offsets and column indices are copied back to the host; the host value
  array is allocated but not filled, and C is flagged as PETSC_OFFLOAD_GPU.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* look inside A and B only after they have been validated */
  a     = (Mat_SeqAIJ *)A->data;
  b     = (Mat_SeqAIJ *)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and the device structures hanging off it */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device resident copies of the scalar constants 1, 0 and 1 */
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* row index of every nonzero (COO format) of A, B and C */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the row offsets of A and B into one row index per nonzero */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag carried through the merge: 1 for entries coming from A, 0 for entries coming from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* the columns of B are shifted by the number of columns of A on the fly */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      /* shift the column indices of B in place instead; this is undone after the merge */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      /* (row, column, value, tag) streams of A, B and of the output C */
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      /* p1 receives the positions in C of the entries of A, p2 (Annz entries later) those of the entries of B */
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* restore the column indices of B */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the positions 0..nz-1 of C according to the tag: nonzero tag (from A) to p1, zero tag (from B) to p2 */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged row indices into the row offsets of C */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the transpose of C is the transpose of A stacked on top of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one entry: the last offset of A^T is overwritten by the first (shifted) offset of B^T */
          thrust::advance(rT, -1);
        }
        if (BT) {
          /* the offsets of B^T start after the a->nz entries of A^T */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the sparsity pattern of C on the host */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths, number of nonempty rows and longest row, all derived from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    /* the host values are only allocated, the up-to-date values live on the device (see the offloadmask below) */
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* C already exists: its pattern is kept and only the values are refreshed */
    PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      /* the sizes have to agree with the ones C was built from */
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords holds the positions of the values of A first, the positions of the values of B start at pmid */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* pair every value of A, then of B, with its slot in C; VecCUDAEquals (defined elsewhere) is used to copy the first tuple entry into the second */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* the values of the transpose of C are those of A^T followed by those of B^T */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4993 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies values from the device value array of A into v[]

  Input Parameters:
+ A   - the matrix
. n   - the number of values to copy
- idx - the positions of the values to gather, or NULL to copy the first n values. Presumably a host array:
        it is uploaded with assign() and logged as a CPU to GPU transfer

  Output Parameter:
. v - where the values are stored; isCudaMem() decides whether it is treated as device or host memory
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    /* device staging buffer, left empty when v is device memory; being a scoped object it is released on every exit path */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;

    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    if (dmem) dv = thrust::device_pointer_cast(v);
    else dv = w.data();
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[widx[k]] into dv[k]; VecCUDAEquals (defined elsewhere) is used to copy the first tuple entry into the second */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* in both branches a host destination means n scalars moved from the device to the host */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5029 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
5030