xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision dafee7125fca1e076965f187c9cfc0a41982905b)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <exception>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
  #include <cuda/std/functional>
#endif
26 
27 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29 /*
30   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32 */
33 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
36 #endif
37 
/* Forward declarations of file-local routines, so they can be referenced before their definitions. */

/* Symbolic/numeric factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
/* The following are declared only when the CUDA version is below 11.4 */
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
/* Overload taking a triangular-factor struct; the mult-struct overload is declared further below */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
/* Options, scaling/AXPY and the matrix-vector product family */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destruction of the data structures attached to the matrix */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

/* Device-to-host copy and transpose invalidation */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* Sub-array copy and COO assembly */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71 
72 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
73 {
74   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
75 
76   PetscFunctionBegin;
77   switch (op) {
78   case MAT_CUSPARSE_MULT:
79     cusparsestruct->format = format;
80     break;
81   case MAT_CUSPARSE_ALL:
82     cusparsestruct->format = format;
83     break;
84   default:
85     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
86   }
87   PetscFunctionReturn(PETSC_SUCCESS);
88 }
89 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch by name to the implementation attached to A as "MatCUSPARSESetFormat_C".
     NOTE(review): PetscTryMethod() presumably does nothing when no such method is attached -- confirm. */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
113 
114 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
115 {
116   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
117 
118   PetscFunctionBegin;
119   cusparsestruct->use_cpu_solve = use_cpu;
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch by name to the implementation attached to A as "MatCUSPARSESetUseCPUSolve_C".
     NOTE(review): PetscTryMethod() presumably does nothing when no such method is attached -- confirm. */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
146 
147 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
148 {
149   PetscFunctionBegin;
150   switch (op) {
151   case MAT_FORM_EXPLICIT_TRANSPOSE:
152     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
153     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
154     A->form_explicit_transpose = flg;
155     break;
156   default:
157     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
158     break;
159   }
160   PetscFunctionReturn(PETSC_SUCCESS);
161 }
162 
163 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
164 {
165   MatCUSPARSEStorageFormat format;
166   PetscBool                flg;
167   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
168 
169   PetscFunctionBegin;
170   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
171   if (A->factortype == MAT_FACTOR_NONE) {
172     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
173     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
174 
175     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
176     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
177     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
178     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
179 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
180     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
181     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
182   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
183     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
184   #else
185     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186   #endif
187     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
188     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
189 
190     PetscCall(
191       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
192     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
193 #endif
194   }
195   PetscOptionsHeadEnd();
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
200 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
201 {
202   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
203   PetscInt                      m  = A->rmap->n;
204   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
205   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
206   const MatScalar              *Aa = a->a;
207   PetscInt                     *Mi, *Mj, Mnz;
208   PetscScalar                  *Ma;
209 
210   PetscFunctionBegin;
211   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
212     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
213       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
214       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
215       PetscCall(PetscMalloc1(m + 1, &Mi));
216       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
217       PetscCall(PetscMalloc1(Mnz, &Ma));
218       Mi[0] = 0;
219       for (PetscInt i = 0; i < m; i++) {
220         PetscInt llen = Ai[i + 1] - Ai[i];
221         PetscInt ulen = Adiag[i] - Adiag[i + 1];
222         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
223         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
224         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
225         Mi[i + 1] = Mi[i] + llen + ulen;
226       }
227       // Copy M (L,U) from host to device
228       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
229       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
230       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
231       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
232       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
233 
234       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
235       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
236       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
237       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
238       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
239       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
240       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
241       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
242 
243       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
244       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
245       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
246 
247       fillMode = CUSPARSE_FILL_MODE_UPPER;
248       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
249       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
250       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
251       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
252 
253       // Allocate work vectors in SpSv
254       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
255       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
256 
257       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
258       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
259 
260       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
261       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
262       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
263       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
264       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
265       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
266       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
267 
268       // Record for reuse
269       fs->csrRowPtr_h = Mi;
270       fs->csrVal_h    = Ma;
271       PetscCall(PetscFree(Mj));
272     }
273     // Copy the value
274     Mi  = fs->csrRowPtr_h;
275     Ma  = fs->csrVal_h;
276     Mnz = Mi[m];
277     for (PetscInt i = 0; i < m; i++) {
278       PetscInt llen = Ai[i + 1] - Ai[i];
279       PetscInt ulen = Adiag[i] - Adiag[i + 1];
280       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
281       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
282       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
283     }
284     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
285 
286   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
287     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
288       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
289       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
290       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
291     } else
292   #endif
293     {
294       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
295       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
296 
297       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
298       fs->updatedSpSVAnalysis          = PETSC_TRUE;
299       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
300     }
301   }
302   PetscFunctionReturn(PETSC_SUCCESS);
303 }
304 #else
305 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
306 {
307   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
308   PetscInt                           n                  = A->rmap->n;
309   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
310   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
311   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
312   const MatScalar                   *aa = a->a, *v;
313   PetscInt                          *AiLo, *AjLo;
314   PetscInt                           i, nz, nzLower, offset, rowOffset;
315 
316   PetscFunctionBegin;
317   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
318   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
319     try {
320       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
321       nzLower = n + ai[n] - ai[1];
322       if (!loTriFactor) {
323         PetscScalar *AALo;
324 
325         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
326 
327         /* Allocate Space for the lower triangular matrix */
328         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
329         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
330 
331         /* Fill the lower triangular matrix */
332         AiLo[0]   = (PetscInt)0;
333         AiLo[n]   = nzLower;
334         AjLo[0]   = (PetscInt)0;
335         AALo[0]   = (MatScalar)1.0;
336         v         = aa;
337         vi        = aj;
338         offset    = 1;
339         rowOffset = 1;
340         for (i = 1; i < n; i++) {
341           nz = ai[i + 1] - ai[i];
342           /* additional 1 for the term on the diagonal */
343           AiLo[i] = rowOffset;
344           rowOffset += nz + 1;
345 
346           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
347           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
348 
349           offset += nz;
350           AjLo[offset] = (PetscInt)i;
351           AALo[offset] = (MatScalar)1.0;
352           offset += 1;
353 
354           v += nz;
355           vi += nz;
356         }
357 
358         /* allocate space for the triangular factor information */
359         PetscCall(PetscNew(&loTriFactor));
360         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
361         /* Create the matrix description */
362         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
363         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
364   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
365         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
366   #else
367         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
368   #endif
369         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
370         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
371 
372         /* set the operation */
373         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
374 
375         /* set the matrix */
376         loTriFactor->csrMat              = new CsrMatrix;
377         loTriFactor->csrMat->num_rows    = n;
378         loTriFactor->csrMat->num_cols    = n;
379         loTriFactor->csrMat->num_entries = nzLower;
380 
381         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
382         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
383 
384         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
385         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
386 
387         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
388         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
389 
390         /* Create the solve analysis information */
391         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
392         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
393   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
394         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
395                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
396         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
397   #endif
398 
399         /* perform the solve analysis */
400         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
401                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
402         PetscCallCUDA(WaitForCUDA());
403         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
404 
405         /* assign the pointer */
406         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
407         loTriFactor->AA_h                                          = AALo;
408         PetscCallCUDA(cudaFreeHost(AiLo));
409         PetscCallCUDA(cudaFreeHost(AjLo));
410         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
411       } else { /* update values only */
412         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
413         /* Fill the lower triangular matrix */
414         loTriFactor->AA_h[0] = 1.0;
415         v                    = aa;
416         vi                   = aj;
417         offset               = 1;
418         for (i = 1; i < n; i++) {
419           nz = ai[i + 1] - ai[i];
420           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
421           offset += nz;
422           loTriFactor->AA_h[offset] = 1.0;
423           offset += 1;
424           v += nz;
425         }
426         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
427         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
428       }
429     } catch (char *ex) {
430       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
431     }
432   }
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435 
436 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
437 {
438   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
439   PetscInt                           n                  = A->rmap->n;
440   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
441   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
442   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
443   const MatScalar                   *aa = a->a, *v;
444   PetscInt                          *AiUp, *AjUp;
445   PetscInt                           i, nz, nzUpper, offset;
446 
447   PetscFunctionBegin;
448   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
449   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
450     try {
451       /* next, figure out the number of nonzeros in the upper triangular matrix. */
452       nzUpper = adiag[0] - adiag[n];
453       if (!upTriFactor) {
454         PetscScalar *AAUp;
455 
456         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
457 
458         /* Allocate Space for the upper triangular matrix */
459         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
460         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
461 
462         /* Fill the upper triangular matrix */
463         AiUp[0] = (PetscInt)0;
464         AiUp[n] = nzUpper;
465         offset  = nzUpper;
466         for (i = n - 1; i >= 0; i--) {
467           v  = aa + adiag[i + 1] + 1;
468           vi = aj + adiag[i + 1] + 1;
469 
470           /* number of elements NOT on the diagonal */
471           nz = adiag[i] - adiag[i + 1] - 1;
472 
473           /* decrement the offset */
474           offset -= (nz + 1);
475 
476           /* first, set the diagonal elements */
477           AjUp[offset] = (PetscInt)i;
478           AAUp[offset] = (MatScalar)1. / v[nz];
479           AiUp[i]      = AiUp[i + 1] - (nz + 1);
480 
481           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
482           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
483         }
484 
485         /* allocate space for the triangular factor information */
486         PetscCall(PetscNew(&upTriFactor));
487         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
488 
489         /* Create the matrix description */
490         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
491         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
492   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
493         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
494   #else
495         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
496   #endif
497         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
498         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
499 
500         /* set the operation */
501         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
502 
503         /* set the matrix */
504         upTriFactor->csrMat              = new CsrMatrix;
505         upTriFactor->csrMat->num_rows    = n;
506         upTriFactor->csrMat->num_cols    = n;
507         upTriFactor->csrMat->num_entries = nzUpper;
508 
509         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
510         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
511 
512         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
513         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
514 
515         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
516         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
517 
518         /* Create the solve analysis information */
519         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
520         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
521   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
522         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
523                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
524         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
525   #endif
526 
527         /* perform the solve analysis */
528         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
529                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
530 
531         PetscCallCUDA(WaitForCUDA());
532         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
533 
534         /* assign the pointer */
535         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
536         upTriFactor->AA_h                                          = AAUp;
537         PetscCallCUDA(cudaFreeHost(AiUp));
538         PetscCallCUDA(cudaFreeHost(AjUp));
539         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
540       } else {
541         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
542         /* Fill the upper triangular matrix */
543         offset = nzUpper;
544         for (i = n - 1; i >= 0; i--) {
545           v = aa + adiag[i + 1] + 1;
546 
547           /* number of elements NOT on the diagonal */
548           nz = adiag[i] - adiag[i + 1] - 1;
549 
550           /* decrement the offset */
551           offset -= (nz + 1);
552 
553           /* first, set the diagonal elements */
554           upTriFactor->AA_h[offset] = 1. / v[nz];
555           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
556         }
557         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
558         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
559       }
560     } catch (char *ex) {
561       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
562     }
563   }
564   PetscFunctionReturn(PETSC_SUCCESS);
565 }
566 #endif
567 
568 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
569 {
570   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
571   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
572   IS                            isrow = a->row, isicol = a->icol;
573   PetscBool                     row_identity, col_identity;
574   PetscInt                      n = A->rmap->n;
575 
576   PetscFunctionBegin;
577   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
578 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
579   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
580 #else
581   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
582   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
583   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
584 #endif
585 
586   cusparseTriFactors->nnz = a->nz;
587 
588   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
589   /* lower triangular indices */
590   PetscCall(ISIdentity(isrow, &row_identity));
591   if (!row_identity && !cusparseTriFactors->rpermIndices) {
592     const PetscInt *r;
593 
594     PetscCall(ISGetIndices(isrow, &r));
595     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
596     cusparseTriFactors->rpermIndices->assign(r, r + n);
597     PetscCall(ISRestoreIndices(isrow, &r));
598     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
599   }
600 
601   /* upper triangular indices */
602   PetscCall(ISIdentity(isicol, &col_identity));
603   if (!col_identity && !cusparseTriFactors->cpermIndices) {
604     const PetscInt *c;
605 
606     PetscCall(ISGetIndices(isicol, &c));
607     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
608     cusparseTriFactors->cpermIndices->assign(c, c + n);
609     PetscCall(ISRestoreIndices(isicol, &c));
610     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
611   }
612   PetscFunctionReturn(PETSC_SUCCESS);
613 }
614 
615 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
616 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
617 {
618   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
619   PetscInt                      m  = A->rmap->n;
620   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
621   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
622   const MatScalar              *Aa = a->a;
623   PetscInt                     *Mj, Mnz;
624   PetscScalar                  *Ma, *D;
625 
626   PetscFunctionBegin;
627   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
628     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
629       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
630       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
631       Mnz = Ai[m]; // Unz (with the unit diagonal)
632       PetscCall(PetscMalloc1(Mnz, &Ma));
633       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
634       PetscCall(PetscMalloc1(m, &D));    // the diagonal
635       for (PetscInt i = 0; i < m; i++) {
636         PetscInt ulen = Ai[i + 1] - Ai[i];
637         Mj[Ai[i]]     = i;                                              // diagonal entry
638         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
639       }
640       // Copy M (U) from host to device
641       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
642       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
643       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
644       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
645       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
646       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
647 
648       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
649       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
650       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
651       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
652       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
653       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
654       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
655       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
656 
657       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
658       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
659       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
660 
661       // Allocate work vectors in SpSv
662       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
663       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
664 
665       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
666       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
667 
668       // Query buffer sizes for SpSV and then allocate buffers
669       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
670       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
671       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
672 
673       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
674       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
675       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
676 
677       // Record for reuse
678       fs->csrVal_h = Ma;
679       fs->diag_h   = D;
680       PetscCall(PetscFree(Mj));
681     }
682     // Copy the value
683     Ma  = fs->csrVal_h;
684     D   = fs->diag_h;
685     Mnz = Ai[m];
686     for (PetscInt i = 0; i < m; i++) {
687       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
688       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
689       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
690     }
691     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
692     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
693 
694   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
695     if (fs->updatedSpSVAnalysis) {
696       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
697       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
698     } else
699   #endif
700     {
701       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
702       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
703       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
704       fs->updatedSpSVAnalysis = PETSC_TRUE;
705     }
706   }
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
710 // Solve Ut D U x = b
711 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
712 {
713   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
714   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
715   const PetscScalar                    *barray;
716   PetscScalar                          *xarray;
717   thrust::device_ptr<const PetscScalar> bGPU;
718   thrust::device_ptr<PetscScalar>       xGPU;
719   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
720   PetscInt                              m   = A->rmap->n;
721 
722   PetscFunctionBegin;
723   PetscCall(PetscLogGpuTimeBegin());
724   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
725   PetscCall(VecCUDAGetArrayRead(b, &barray));
726   xGPU = thrust::device_pointer_cast(xarray);
727   bGPU = thrust::device_pointer_cast(barray);
728 
729   // Reorder b with the row permutation if needed, and wrap the result in fs->X
730   if (fs->rpermIndices) {
731     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
732     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
733   } else {
734     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
735   }
736 
737   // Solve Ut Y = X
738   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
739   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
740 
741   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
742   // It is basically a vector element-wise multiplication, but cublas does not have it!
743   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
744 
745   // Solve U X = Y
746   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
747     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
748   } else {
749     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
750   }
751   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
752 
753   // Reorder X with the column permutation if needed, and put the result back to x
754   if (fs->cpermIndices) {
755     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
756                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
757   }
758 
759   PetscCall(VecCUDARestoreArrayRead(b, &barray));
760   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
761   PetscCall(PetscLogGpuTimeEnd());
762   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 #else
766 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
767 {
768   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
769   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
770   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
771   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
772   PetscInt                          *AiUp, *AjUp;
773   PetscScalar                       *AAUp;
774   PetscScalar                       *AALo;
775   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
776   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
777   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
778   const MatScalar                   *aa = b->a, *v;
779 
780   PetscFunctionBegin;
781   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
782   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
783     try {
784       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
785       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
786       if (!upTriFactor && !loTriFactor) {
787         /* Allocate Space for the upper triangular matrix */
788         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
789         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
790 
791         /* Fill the upper triangular matrix */
792         AiUp[0] = (PetscInt)0;
793         AiUp[n] = nzUpper;
794         offset  = 0;
795         for (i = 0; i < n; i++) {
796           /* set the pointers */
797           v  = aa + ai[i];
798           vj = aj + ai[i];
799           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
800 
801           /* first, set the diagonal elements */
802           AjUp[offset] = (PetscInt)i;
803           AAUp[offset] = (MatScalar)1.0 / v[nz];
804           AiUp[i]      = offset;
805           AALo[offset] = (MatScalar)1.0 / v[nz];
806 
807           offset += 1;
808           if (nz > 0) {
809             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
810             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
811             for (j = offset; j < offset + nz; j++) {
812               AAUp[j] = -AAUp[j];
813               AALo[j] = AAUp[j] / v[nz];
814             }
815             offset += nz;
816           }
817         }
818 
819         /* allocate space for the triangular factor information */
820         PetscCall(PetscNew(&upTriFactor));
821         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
822 
823         /* Create the matrix description */
824         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
825         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
826   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
827         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
828   #else
829         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
830   #endif
831         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
832         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
833 
834         /* set the matrix */
835         upTriFactor->csrMat              = new CsrMatrix;
836         upTriFactor->csrMat->num_rows    = A->rmap->n;
837         upTriFactor->csrMat->num_cols    = A->cmap->n;
838         upTriFactor->csrMat->num_entries = a->nz;
839 
840         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
841         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
842 
843         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
844         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
845 
846         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
847         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
848 
849         /* set the operation */
850         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
851 
852         /* Create the solve analysis information */
853         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
854         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
855   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
856         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
857                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
858         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
859   #endif
860 
861         /* perform the solve analysis */
862         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
863                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
864 
865         PetscCallCUDA(WaitForCUDA());
866         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
867 
868         /* assign the pointer */
869         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
870 
871         /* allocate space for the triangular factor information */
872         PetscCall(PetscNew(&loTriFactor));
873         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
874 
875         /* Create the matrix description */
876         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
877         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
878   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
879         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
880   #else
881         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
882   #endif
883         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
884         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
885 
886         /* set the operation */
887         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
888 
889         /* set the matrix */
890         loTriFactor->csrMat              = new CsrMatrix;
891         loTriFactor->csrMat->num_rows    = A->rmap->n;
892         loTriFactor->csrMat->num_cols    = A->cmap->n;
893         loTriFactor->csrMat->num_entries = a->nz;
894 
895         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
896         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
897 
898         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
899         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
900 
901         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
902         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
903 
904         /* Create the solve analysis information */
905         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
906         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
907   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
908         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
909                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
910         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
911   #endif
912 
913         /* perform the solve analysis */
914         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
915                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
916 
917         PetscCallCUDA(WaitForCUDA());
918         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
919 
920         /* assign the pointer */
921         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
922 
923         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
924         PetscCallCUDA(cudaFreeHost(AiUp));
925         PetscCallCUDA(cudaFreeHost(AjUp));
926       } else {
927         /* Fill the upper triangular matrix */
928         offset = 0;
929         for (i = 0; i < n; i++) {
930           /* set the pointers */
931           v  = aa + ai[i];
932           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
933 
934           /* first, set the diagonal elements */
935           AAUp[offset] = 1.0 / v[nz];
936           AALo[offset] = 1.0 / v[nz];
937 
938           offset += 1;
939           if (nz > 0) {
940             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
941             for (j = offset; j < offset + nz; j++) {
942               AAUp[j] = -AAUp[j];
943               AALo[j] = AAUp[j] / v[nz];
944             }
945             offset += nz;
946           }
947         }
948         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
949         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
950         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
951         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
952         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
953       }
954       PetscCallCUDA(cudaFreeHost(AAUp));
955       PetscCallCUDA(cudaFreeHost(AALo));
956     } catch (char *ex) {
957       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
958     }
959   }
960   PetscFunctionReturn(PETSC_SUCCESS);
961 }
962 #endif
963 
964 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
965 {
966   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
967   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
968   IS                            ip                 = a->row;
969   PetscBool                     perm_identity;
970   PetscInt                      n = A->rmap->n;
971 
972   PetscFunctionBegin;
973   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
974 
975 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
976   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
977 #else
978   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
979   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
980 #endif
981   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
982 
983   A->offloadmask = PETSC_OFFLOAD_BOTH;
984 
985   /* lower triangular indices */
986   PetscCall(ISIdentity(ip, &perm_identity));
987   if (!perm_identity) {
988     IS              iip;
989     const PetscInt *irip, *rip;
990 
991     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
992     PetscCall(ISGetIndices(iip, &irip));
993     PetscCall(ISGetIndices(ip, &rip));
994     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
995     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
996     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
997     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
998     PetscCall(ISRestoreIndices(iip, &irip));
999     PetscCall(ISDestroy(&iip));
1000     PetscCall(ISRestoreIndices(ip, &rip));
1001     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1002   }
1003   PetscFunctionReturn(PETSC_SUCCESS);
1004 }
1005 
1006 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1007 {
1008   PetscFunctionBegin;
1009   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1010   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1011   B->offloadmask = PETSC_OFFLOAD_CPU;
1012 
1013 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1014   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1015   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1016 #else
1017   /* determine which version of MatSolve needs to be used. */
1018   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1019   IS          ip = b->row;
1020   PetscBool   perm_identity;
1021 
1022   PetscCall(ISIdentity(ip, &perm_identity));
1023   if (perm_identity) {
1024     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1025     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1026   } else {
1027     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1028     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1029   }
1030 #endif
1031   B->ops->matsolve          = NULL;
1032   B->ops->matsolvetranspose = NULL;
1033 
1034   /* get the triangular factors */
1035   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1036   PetscFunctionReturn(PETSC_SUCCESS);
1037 }
1038 
1039 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1040 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1041 {
1042   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1043   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1044   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1045   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1046   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1047   cusparseIndexBase_t                indexBase;
1048   cusparseMatrixType_t               matrixType;
1049   cusparseFillMode_t                 fillMode;
1050   cusparseDiagType_t                 diagType;
1051 
1052   PetscFunctionBegin;
1053   /* allocate space for the transpose of the lower triangular factor */
1054   PetscCall(PetscNew(&loTriFactorT));
1055   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1056 
1057   /* set the matrix descriptors of the lower triangular factor */
1058   matrixType = cusparseGetMatType(loTriFactor->descr);
1059   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1060   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1061   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1062 
1063   /* Create the matrix description */
1064   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1065   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1066   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1067   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1068   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1069 
1070   /* set the operation */
1071   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1072 
1073   /* allocate GPU space for the CSC of the lower triangular factor*/
1074   loTriFactorT->csrMat                 = new CsrMatrix;
1075   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1076   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1077   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1078   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1079   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1080   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1081 
1082   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1083   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1084   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1085                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1086                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1087   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1088   #endif
1089 
1090   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1091   {
1092     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1093     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1094                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1095   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1096                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1097   #else
1098                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1099   #endif
1100     PetscCallCUSPARSE(stat);
1101   }
1102 
1103   PetscCallCUDA(WaitForCUDA());
1104   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1105 
1106   /* Create the solve analysis information */
1107   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1108   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1109   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1110   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1111                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1112   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1113   #endif
1114 
1115   /* perform the solve analysis */
1116   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1117                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1118 
1119   PetscCallCUDA(WaitForCUDA());
1120   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1121 
1122   /* assign the pointer */
1123   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1124 
1125   /*********************************************/
1126   /* Now the Transpose of the Upper Tri Factor */
1127   /*********************************************/
1128 
1129   /* allocate space for the transpose of the upper triangular factor */
1130   PetscCall(PetscNew(&upTriFactorT));
1131   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1132 
1133   /* set the matrix descriptors of the upper triangular factor */
1134   matrixType = cusparseGetMatType(upTriFactor->descr);
1135   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1136   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1137   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1138 
1139   /* Create the matrix description */
1140   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1141   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1142   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1143   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1144   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1145 
1146   /* set the operation */
1147   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1148 
1149   /* allocate GPU space for the CSC of the upper triangular factor*/
1150   upTriFactorT->csrMat                 = new CsrMatrix;
1151   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1152   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1153   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1154   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1155   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1156   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1157 
1158   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1159   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1160   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1161                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1162                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1163   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1164   #endif
1165 
1166   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1167   {
1168     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1169     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1170                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1171   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1172                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1173   #else
1174                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1175   #endif
1176     PetscCallCUSPARSE(stat);
1177   }
1178 
1179   PetscCallCUDA(WaitForCUDA());
1180   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1181 
1182   /* Create the solve analysis information */
1183   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1184   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1185   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1186   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1187                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1188   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1189   #endif
1190 
1191   /* perform the solve analysis */
1192   /* christ, would it have killed you to put this stuff in a function????????? */
1193   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1195 
1196   PetscCallCUDA(WaitForCUDA());
1197   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1198 
1199   /* assign the pointer */
1200   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1201   PetscFunctionReturn(PETSC_SUCCESS);
1202 }
1203 #endif
1204 
/*
  PetscScalarToPetscInt - stateless unary functor that converts a PetscScalar to a PetscInt by
  taking its real part and casting (truncating) it to an integer.

  It is used with thrust::transform() in this file to turn scalar-encoded entry positions into an
  integer index array.
*/
struct PetscScalarToPetscInt {
  /* const-qualified: the functor has no state, so it must also be callable through const objects */
  __host__ __device__ PetscInt operator()(PetscScalar s) const { return (PetscInt)PetscRealPart(s); }
};
1208 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Builds, or refreshes the values of, the explicit transpose
  of A on the GPU and stores it in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; it is first brought up to date on the GPU with MatSeqAIJCUSPARSECopyToGPU()

  Notes:
  Returns immediately (after sanity checks) when A->transupdated is already set.

  MAT_CUSPARSE_CSR: the transpose structure is created once. csr2csc is run on a values array holding
  0,1,2,... so that the transposed "values" encode, for each entry of the transpose, the position of
  that entry in A. This permutation is cached in cusparsestruct->csr2csc_i and later calls only gather
  the current values of A through it.

  MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB (CUDA < 11 only): any existing transpose is first invalidated
  with MatSeqAIJCUSPARSEInvalidateTranspose(), then the transpose is rebuilt through
  HYB -> CSR -> CSC -> HYB conversions.

  On success A->transupdated is set to PETSC_TRUE and matstructT->cprowIndices is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  /* a transpose flagged as up to date must actually exist */
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* only the CSR format can update an existing transpose in place; other formats start over */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident copies of the scalar constants 1, 0 and 1 */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of the transpose: rows and columns of A are swapped */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the (uncompressed) host row offsets of A; they are the csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns, and csr2csc writes one offset per
         column of A plus one, so the offsets array must hold A->cmap->n + 1 entries */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the matrix being converted is the transpose, hence the swapped sizes */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* No cached permutation yet: transpose the sequence 0,1,2,... instead of the real values, so the
         transposed "values" record where each entry of the transpose lives in A */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the scalar-encoded positions into the integer permutation that is cached for reuse */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the current values of A into the transpose through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1401 
1402 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - Solves with the L and U factors held in A->spptr using two
  cusparseSpSV_solve() calls (L first, then U), writing the result into x.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  When fs->rpermIndices is set, b is gathered through it into the work array fs->X before the L solve.
  When fs->cpermIndices is set, the U solve writes into fs->X and the result is gathered through
  cpermIndices into x; otherwise the U solve writes straight into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     op      = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       alg     = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                m       = A->rmap->n;
  const PetscScalar            *barray  = nullptr;
  PetscScalar                  *xarray  = nullptr;
  void                         *rhsData = nullptr; // device array the L solve reads from
  void                         *solData = nullptr; // device array the U solve writes to

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));

  // Right-hand side of the L solve: either b itself, or b gathered through the row permutation into fs->X
  rhsData = (void *)barray;
  if (fs->rpermIndices) {
    thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(barray);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    rhsData = (void *)fs->X;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, rhsData));

  // L Y = X, with Y backed by the work array fs->Y
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // U X = Y, with X backed by fs->X when a column permutation still has to be applied, else by x directly
  solData = fs->cpermIndices ? (void *)fs->X : (void *)xarray;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, solData));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Gather fs->X through the column permutation into x
  if (fs->cpermIndices) {
    thrust::device_ptr<PetscScalar> xGPU = thrust::device_pointer_cast(xarray);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1454 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Transpose solve with the L and U factors held in A->spptr.
  The same L and U matrix descriptors are used as in the forward solve, but with
  CUSPARSE_OPERATION_TRANSPOSE; U^T is solved first, then L^T.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The SpSV descriptors and buffers for the transposed solves are created on the first call
  (fs->createdTransposeSpSVDescr) and the analysis is rerun whenever
  fs->updatedTransposeSpSVAnalysis is not set.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     opA     = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       alg     = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                m       = A->rmap->n;
  const PetscScalar            *barray  = nullptr;
  PetscScalar                  *xarray  = nullptr;
  void                         *rhsData = nullptr; // device array the U^T solve reads from
  void                         *solData = nullptr; // device array the L^T solve writes to

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  // One-time setup: descriptors, buffer sizes and buffers for the two transposed triangular solves
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    /* The matrix is still L. We only do transpose solve with it */
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // (Re)run the analysis phase of both transposed solves when it is flagged as stale
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));

  // Right-hand side of the first solve: either b itself, or b gathered through the row permutation into fs->X
  rhsData = (void *)barray;
  if (fs->rpermIndices) {
    thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(barray);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    rhsData = (void *)fs->X;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, rhsData));

  // Ut Y = X, with Y backed by the work array fs->Y
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Lt X = Y; if a permutation still has to be applied the intermediate buffer fs->X is the target, else x
  solData = fs->cpermIndices ? (void *)fs->X : (void *)xarray;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, solData));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Gather fs->X through the column permutation into x
  if (fs->cpermIndices) {
    thrust::device_ptr<PetscScalar> xGPU = thrust::device_pointer_cast(xarray);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1525 #else
1526 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve using explicitly transposed triangular factors
  (loTriFactorPtrTranspose / upTriFactorPtrTranspose) and the csrsv solve routines.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  If neither transposed factor exists yet, they are built on the fly with
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().

  This variant dereferences both rpermIndices and cpermIndices unconditionally, so both permutations
  must be present.

  All thrust calls are wrapped in PetscCallThrust() so that a thrust failure is reported through the
  PETSc error handler, as is done for the other thrust calls in this file.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    /* re-read the pointers: the analysis routine stores the newly created factors in cusparseTriFactors */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: xGPU[i] = bGPU[rperm[i]] */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U: reads xarray, writes the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: reads the work vector, writes xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1578 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve with the cached transposed
  triangular factors; no row/column permutation is applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The transposed factors are created on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  The work vector stored with the factors carries the intermediate result from the first
  triangular solve to the second.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  PetscScalar                       *sol;
  const PetscScalar                 *rhs;

  PetscFunctionBegin;
  /* Neither transposed factor is cached yet: build them now and re-read the pointers */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device pointers of the solution (write-only) and the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  {
    auto Ucsr    = upperT->csrMat;
    auto Lcsr    = lowerT->csrMat;
    auto scratch = work->data().get();

    /* Step 1: solve with the transposed U factor, rhs -> scratch */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, Ucsr->num_rows, Ucsr->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr,
                                           Ucsr->values->data().get(), Ucsr->row_offsets->data().get(), Ucsr->column_indices->data().get(),
                                           upperT->solveInfo, rhs, scratch, upperT->solvePolicy, upperT->solveBuffer));

    /* Step 2: solve with the transposed L factor, scratch -> sol */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, Lcsr->num_rows, Lcsr->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr,
                                           Lcsr->values->data().get(), Lcsr->row_offsets->data().get(), Lcsr->column_indices->data().get(),
                                           lowerT->solveInfo, scratch, sol, lowerT->solvePolicy, lowerT->solveBuffer));
  }

  /* Hand the arrays back to the vectors, then close the timing region and log the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1616 
/*
  MatSolve_SeqAIJCUSPARSE - triangular solves with the cached L and U factors, with the
  row permutation applied to the right-hand side and the column permutation applied to the result.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The work vector stored with the factors and xx itself are used alternately as source and
  destination of the individual steps, so xx is overwritten before the final result is produced.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors         *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                          *work    = (THRUSTARRAY *)factors->workVector;
  PetscScalar                          *sol;
  const PetscScalar                    *rhs;
  thrust::device_ptr<PetscScalar>       solDev;
  thrust::device_ptr<const PetscScalar> rhsDev;

  PetscFunctionBegin;
  /* Device pointers of the solution (write-only) and the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));
  solDev = thrust::device_pointer_cast(sol);
  rhsDev = thrust::device_pointer_cast(rhs);

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: gather the right-hand side through the row permutation into the work vector */
  {
    auto &rowPerm = *factors->rpermIndices;

    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsDev, rowPerm.begin()), thrust::make_permutation_iterator(rhsDev, rowPerm.end()), work->begin());
  }

  /* Step 2: solve with L, work vector -> sol */
  {
    auto Lcsr = lower->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, Lcsr->num_rows, Lcsr->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                           Lcsr->values->data().get(), Lcsr->row_offsets->data().get(), Lcsr->column_indices->data().get(),
                                           lower->solveInfo, work->data().get(), sol, lower->solvePolicy, lower->solveBuffer));
  }

  /* Step 3: solve with U, sol -> work vector */
  {
    auto Ucsr = upper->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, Ucsr->num_rows, Ucsr->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                           Ucsr->values->data().get(), Ucsr->row_offsets->data().get(), Ucsr->column_indices->data().get(),
                                           upper->solveInfo, sol, work->data().get(), upper->solvePolicy, upper->solveBuffer));
  }

  /* Step 4: gather the work vector through the column permutation into the solution */
  {
    auto &colPerm = *factors->cpermIndices;

    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work->begin(), colPerm.begin()), thrust::make_permutation_iterator(work->begin(), colPerm.end()), solDev);
  }

  /* Hand the arrays back to the vectors, then close the timing region and log the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1656 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - triangular solves with the cached L and U factors;
  no row/column permutation is applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The work vector stored with the factors carries the intermediate result from the L solve
  to the U solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  PetscScalar                       *sol;
  const PetscScalar                 *rhs;

  PetscFunctionBegin;
  /* Device pointers of the solution (write-only) and the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  {
    auto Lcsr    = lower->csrMat;
    auto Ucsr    = upper->csrMat;
    auto scratch = work->data().get();

    /* Step 1: solve with L, rhs -> scratch */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, Lcsr->num_rows, Lcsr->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                           Lcsr->values->data().get(), Lcsr->row_offsets->data().get(), Lcsr->column_indices->data().get(),
                                           lower->solveInfo, rhs, scratch, lower->solvePolicy, lower->solveBuffer));

    /* Step 2: solve with U, scratch -> sol */
    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, Ucsr->num_rows, Ucsr->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                           Ucsr->values->data().get(), Ucsr->row_offsets->data().get(), Ucsr->column_indices->data().get(),
                                           upper->solveInfo, scratch, sol, upper->solvePolicy, upper->solveBuffer));
  }

  /* Hand the arrays back to the vectors, then close the timing region and log the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1686 #endif
1687 
1688 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of the device ILU(0) factorization.

  Input Parameters:
+ fact - the factor matrix set up by MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(); its device
         arrays and descriptors live in fact->spptr (Mat_SeqAIJCUSPARSETriFactors)
- A    - the matrix to factor; must be of type MATSEQAIJCUSPARSE (checked in debug builds)

  Notes:
  A's values are copied into fs->csrVal and factored in place with cusparseXcsrilu02(). The SpSV
  descriptors for L and U are then either refreshed with cusparseSpSV_updateMatrix() (CUDA >= 12.1.1,
  once an analysis has already been done) or (re)analyzed with cusparseSpSV_analysis().

  In both cases the factor values have changed, so the flag that guards the transpose-solve
  analysis is cleared to force the transpose solve to redo its own analysis on next use.

  On return the solve/solvetranspose operations of fact are set to the SpSV based LU solvers.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* An analysis already exists: only push the new values into the L and U solve descriptors */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }
  /* L, U values have changed in either branch above (only the non-transpose descriptors were refreshed),
     so reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1751 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - symbolic phase of the device ILU(0) factorization.

  Input Parameters:
+ fact - the factor matrix to set up; its device data lives in fact->spptr (Mat_SeqAIJCUSPARSETriFactors)
. A    - the matrix to factor; in debug builds it is checked to be MATSEQAIJCUSPARSE, square and
         to have no missing diagonal entry
- info - factorization options; only info->fill is read (recorded as the given fill ratio)

  Notes:
  The two IS arguments (row/column orderings) are unnamed and ignored.

  Steps performed:
  1. Release any previous factor data and duplicate A's metadata (not its values) into fact.
  2. Allocate device arrays for the 32-bit row pointers, column indices and the values, and copy A's
     device i/j into them. L and U share these arrays; the factorization is done in place.
  3. Create the cuSPARSE descriptors for M (csrilu02), L (lower, unit diagonal) and U (upper, non-unit diagonal).
  4. Query buffer sizes, allocate the buffers, and run the csrilu02 analysis.
  5. Estimate the FLOPs of the numeric factorization and install the numeric routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L: lower triangle of the shared CSR arrays, with an implicit unit diagonal */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* U: upper triangle of the same CSR arrays, with the stored (non-unit) diagonal */
  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    /* Host-side row pointers and per-row diagonal positions of A (this Ai shadows the device Ai above) */
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        /* Use the actual number of entries left of the diagonal; the sparsity pattern of a general
           matrix need not be symmetric, so it is not approximated by (nzRow - 1) / 2 here */
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1908 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - solve with the device ICC(0) factor using two cuSPARSE SpSV
  solves on the same L descriptor: first non-transposed, then transposed.

  Input Parameters:
+ fact - the factored matrix; fact->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- b    - the right-hand side vector

  Output Parameter:
. x - the solution vector

  Notes:
  The dense-vector descriptor X is first pointed at b's device array and later re-pointed at x's
  device array; descriptor Y always refers to the internal buffer fs->Y holding the intermediate result.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *seq     = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *sol;
  const PetscScalar            *rhs;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward step, L Y = X: descriptor X views the right-hand side, descriptor Y views the internal buffer */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       factors->spsvDescr_L));

  /* Backward step, Lt X = Y: descriptor X now views the solution array and receives the result */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * seq->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1939 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - numeric phase of the device ICC(0) factorization.

  Input Parameters:
+ fact - the factor matrix set up by the ICC(0) symbolic phase; its device arrays and
         descriptors live in fact->spptr (Mat_SeqAIJCUSPARSETriFactors)
- A    - the matrix to factor; must be of type MATSEQAIJCUSPARSE (checked in debug builds)

  Notes:
  A's values are copied into fs->csrVal and factored in place with cusparseXcsric02(). The SpSV
  descriptors for L and Lt are then either refreshed with cusparseSpSV_updateMatrix() (CUDA >= 12.1.1,
  once an analysis has already been done) or (re)analyzed with cusparseSpSV_analysis().

  The GPU work is bracketed by PetscLogGpuTimeBegin()/PetscLogGpuTimeEnd(), the same way as in
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(), so the GPU flops logged at the end have a matching GPU time.

  On return both solve and solvetranspose of fact point to MatSolve_SeqAIJCUSPARSE_ICC0().
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* An analysis already exists: only push the new values into the L and Lt solve descriptors */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE:
       "On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F"
       so CUSPARSE_OPERATION_TRANSPOSE is used for the Lt analysis.
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2001 
2002 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2003 {
2004   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2005   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2006   PetscInt                      m, nz;
2007 
2008   PetscFunctionBegin;
2009   if (PetscDefined(USE_DEBUG)) {
2010     PetscInt  i;
2011     PetscBool flg, missing;
2012 
2013     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2014     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2015     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2016     PetscCall(MatMissingDiagonal(A, &missing, &i));
2017     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2018   }
2019 
2020   /* Free the old stale stuff */
2021   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2022 
2023   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2024      but they will not be used. Allocate them just for easy debugging.
2025    */
2026   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2027 
2028   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2029   fact->factortype             = MAT_FACTOR_ICC;
2030   fact->info.factor_mallocs    = 0;
2031   fact->info.fill_ratio_given  = info->fill;
2032   fact->info.fill_ratio_needed = 1.0;
2033 
2034   aij->row = NULL;
2035   aij->col = NULL;
2036 
2037   /* ====================================================================== */
2038   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2039   /* We'll do in-place factorization on fact                                */
2040   /* ====================================================================== */
2041   const int *Ai, *Aj;
2042 
2043   m  = fact->rmap->n;
2044   nz = aij->nz;
2045 
2046   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2047   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2048   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2049   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2050   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2051   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2052 
2053   /* ====================================================================== */
2054   /* Create mat descriptors for M, L                                        */
2055   /* ====================================================================== */
2056   cusparseFillMode_t fillMode;
2057   cusparseDiagType_t diagType;
2058 
2059   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2060   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2061   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2062 
2063   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2064     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2065     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2066     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2067     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2068   */
2069   fillMode = CUSPARSE_FILL_MODE_LOWER;
2070   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2071   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2072   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2073   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2074 
2075   /* ========================================================================= */
2076   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2077   /* ========================================================================= */
2078   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2079   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2080 
2081   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2082   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2083 
2084   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2085   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2086 
2087   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2088   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2089 
2090   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2091   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2092 
2093   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2094      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2095    */
2096   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2097     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2098     fs->spsvBuffer_L = fs->factBuffer_M;
2099     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2100   } else {
2101     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2102     fs->spsvBuffer_Lt = fs->factBuffer_M;
2103     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2104   }
2105 
2106   /* ========================================================================== */
2107   /* Perform analysis of ic0 on M                                               */
2108   /* The lower triangular part of M has the same sparsity pattern as L          */
2109   /* ========================================================================== */
2110   int              structural_zero;
2111   cusparseStatus_t status;
2112 
2113   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2114   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2115   if (PetscDefined(USE_DEBUG)) {
2116     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2117     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2118     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2119   }
2120 
2121   /* Estimate FLOPs of the numeric factorization */
2122   {
2123     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2124     PetscInt      *Ai, nzRow, nzLeft;
2125     PetscLogDouble flops = 0.0;
2126 
2127     Ai = Aseq->i;
2128     for (PetscInt i = 0; i < m; i++) {
2129       nzRow = Ai[i + 1] - Ai[i];
2130       if (nzRow > 1) {
2131         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2132           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2133         */
2134         nzLeft = (nzRow - 1) / 2;
2135         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2136       }
2137     }
2138     fs->numericFactFlops = flops;
2139   }
2140   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2141   PetscFunctionReturn(PETSC_SUCCESS);
2142 }
2143 #endif
2144 
2145 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2146 {
2147   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2148   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2149 
2150   PetscFunctionBegin;
2151   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2152   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2153   B->offloadmask = PETSC_OFFLOAD_CPU;
2154 
2155   if (!cusparsestruct->use_cpu_solve) {
2156 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2157     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2158     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2159 #else
2160     /* determine which version of MatSolve needs to be used. */
2161     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2162     IS          isrow = b->row, iscol = b->col;
2163     PetscBool   row_identity, col_identity;
2164 
2165     PetscCall(ISIdentity(isrow, &row_identity));
2166     PetscCall(ISIdentity(iscol, &col_identity));
2167     if (row_identity && col_identity) {
2168       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2169       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2170     } else {
2171       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2172       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2173     }
2174 #endif
2175   }
2176   B->ops->matsolve          = NULL;
2177   B->ops->matsolvetranspose = NULL;
2178 
2179   /* get the triangular factors */
2180   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2181   PetscFunctionReturn(PETSC_SUCCESS);
2182 }
2183 
2184 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2185 {
2186   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2187 
2188   PetscFunctionBegin;
2189   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2190   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2191   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2192   PetscFunctionReturn(PETSC_SUCCESS);
2193 }
2194 
2195 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2196 {
2197   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2198 
2199   PetscFunctionBegin;
2200 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2201   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2202   if (!info->factoronhost) {
2203     PetscCall(ISIdentity(isrow, &row_identity));
2204     PetscCall(ISIdentity(iscol, &col_identity));
2205   }
2206   if (!info->levels && row_identity && col_identity) {
2207     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2208   } else
2209 #endif
2210   {
2211     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2212     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2213     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2214   }
2215   PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217 
2218 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2219 {
2220   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2221 
2222   PetscFunctionBegin;
2223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2224   PetscBool perm_identity = PETSC_FALSE;
2225   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2226   if (!info->levels && perm_identity) {
2227     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2228   } else
2229 #endif
2230   {
2231     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2232     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2233     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2234   }
2235   PetscFunctionReturn(PETSC_SUCCESS);
2236 }
2237 
2238 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2239 {
2240   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2241 
2242   PetscFunctionBegin;
2243   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2244   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2245   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2246   PetscFunctionReturn(PETSC_SUCCESS);
2247 }
2248 
/* Reports MATSOLVERCUSPARSE as the solver type; the matrix argument is not used. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2255 
2256 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2263 
2264   Level: beginner
2265 
2266 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2267           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2268 M*/
2269 
2270 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2271 {
2272   PetscInt n = A->rmap->n;
2273 
2274   PetscFunctionBegin;
2275   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2276   PetscCall(MatSetSizes(*B, n, n, n, n));
2277   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2278   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2279 
2280   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2281   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2282     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2283     if (!A->boundtocpu) {
2284       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2285       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2286     } else {
2287       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2288       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2289     }
2290     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2291     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2292     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2293   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2294     if (!A->boundtocpu) {
2295       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2296       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2297     } else {
2298       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2299       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2300     }
2301     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2302     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2303   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2304 
2305   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2306   (*B)->canuseordering = PETSC_TRUE;
2307   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2308   PetscFunctionReturn(PETSC_SUCCESS);
2309 }
2310 
2311 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2312 {
2313   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2314   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2315 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2316   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2317 #endif
2318 
2319   PetscFunctionBegin;
2320   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2321     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2322     if (A->factortype == MAT_FACTOR_NONE) {
2323       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2324       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2325     }
2326 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2327     else if (fs->csrVal) {
2328       /* We have a factorized matrix on device and are able to copy it to host */
2329       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2330     }
2331 #endif
2332     else
2333       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2334     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2335     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2336     A->offloadmask = PETSC_OFFLOAD_BOTH;
2337   }
2338   PetscFunctionReturn(PETSC_SUCCESS);
2339 }
2340 
2341 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2342 {
2343   PetscFunctionBegin;
2344   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2345   *array = ((Mat_SeqAIJ *)A->data)->a;
2346   PetscFunctionReturn(PETSC_SUCCESS);
2347 }
2348 
2349 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2350 {
2351   PetscFunctionBegin;
2352   A->offloadmask = PETSC_OFFLOAD_CPU;
2353   *array         = NULL;
2354   PetscFunctionReturn(PETSC_SUCCESS);
2355 }
2356 
2357 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2358 {
2359   PetscFunctionBegin;
2360   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2361   *array = ((Mat_SeqAIJ *)A->data)->a;
2362   PetscFunctionReturn(PETSC_SUCCESS);
2363 }
2364 
/* Ends read-only access: only the caller's pointer is cleared; the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2371 
2372 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2373 {
2374   PetscFunctionBegin;
2375   *array = ((Mat_SeqAIJ *)A->data)->a;
2376   PetscFunctionReturn(PETSC_SUCCESS);
2377 }
2378 
2379 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2380 {
2381   PetscFunctionBegin;
2382   A->offloadmask = PETSC_OFFLOAD_CPU;
2383   *array         = NULL;
2384   PetscFunctionReturn(PETSC_SUCCESS);
2385 }
2386 
2387 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2388 {
2389   Mat_SeqAIJCUSPARSE *cusp;
2390   CsrMatrix          *matrix;
2391 
2392   PetscFunctionBegin;
2393   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2394   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2395   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2396   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2397   matrix = (CsrMatrix *)cusp->mat->mat;
2398 
2399   if (i) {
2400 #if !defined(PETSC_USE_64BIT_INDICES)
2401     *i = matrix->row_offsets->data().get();
2402 #else
2403     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2404 #endif
2405   }
2406   if (j) {
2407 #if !defined(PETSC_USE_64BIT_INDICES)
2408     *j = matrix->column_indices->data().get();
2409 #else
2410     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2411 #endif
2412   }
2413   if (a) *a = matrix->values->data().get();
2414   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2415   PetscFunctionReturn(PETSC_SUCCESS);
2416 }
2417 
/*
  Makes the GPU copy of A current. Work is done only when the offload mask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU; otherwise the routine returns immediately.

  Two paths:
  - nonzero state unchanged since the last copy and CSR format: only the values are re-sent and
    the cached transpose is invalidated (without being destroyed);
  - otherwise: the existing GPU structure, the transpose, the work vector and the row offsets are
    destroyed and everything is rebuilt from the host CSR (compressed-row form if in use).

  Errors with PETSC_ERR_GPU if A is bound to the CPU or required host CSR data is missing.
  The offload mask becomes PETSC_OFFLOAD_BOTH unless the host has no values array.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* cleared below when the host has no values, so the mask is not set to BOTH */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed but the pattern did not: invalidate the transpose without destroying it */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* full rebuild: throw away everything that depends on the old nonzero pattern */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* choose between the compressed-row description (only the rows listed in rindex) and the plain CSR one */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values: take the count from the row offsets and remember not to mark the matrix as BOTH */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants 1, 0, 1; the handle is switched to device pointer mode right after */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          /* the values array is always allocated, but only filled when host values exist */
          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary device CSR matrix, convert it to HYB/ELL, then free the temporary */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        /* logged transfer: row offsets and column indices (int), compressed-row indices (tmp entries), 3 scalar constants and the values */
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* remember which nonzero pattern the GPU copy corresponds to, enabling the values-only path next time */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2566 
2567 struct VecCUDAPlusEquals {
2568   template <typename Tuple>
2569   __host__ __device__ void operator()(Tuple t)
2570   {
2571     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2572   }
2573 };
2574 
2575 struct VecCUDAEquals {
2576   template <typename Tuple>
2577   __host__ __device__ void operator()(Tuple t)
2578   {
2579     thrust::get<1>(t) = thrust::get<0>(t);
2580   }
2581 };
2582 
2583 struct VecCUDAEqualsReverse {
2584   template <typename Tuple>
2585   __host__ __device__ void operator()(Tuple t)
2586   {
2587     thrust::get<0>(t) = thrust::get<1>(t);
2588   }
2589 };
2590 
/* Per-product data attached to a matrix product involving a MATSEQAIJCUSPARSE matrix; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably records whether C was originally a host dense matrix; confirm at the use sites */
  PetscScalar   *Bt;       /* device buffer (released with cudaFree()); presumably holds a transposed copy of B; confirm */
  Mat            X;        /* intermediate dense matrix; receives the sparse-dense product for RARt and PtAP */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr; /* released with delete; usage is outside this part of the file */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense-matrix descriptor for B; recreated when B's leading dimension changes */
  cusparseDnMatDescr_t matCDescr;   /* dense-matrix descriptor for C or X; recreated when its leading dimension changes */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* device buffers, released with cudaFree() */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;  /* device work buffer, released with cudaFree() */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2615 
2616 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2617 {
2618   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2619 
2620   PetscFunctionBegin;
2621   PetscCallCUDA(cudaFree(mmdata->Bt));
2622   delete mmdata->Bcsr;
2623 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2624   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2625   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2626   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2627   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2628   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2629   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2630   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2631   #endif
2632   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2633   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2634 #endif
2635   PetscCall(MatDestroy(&mmdata->X));
2636   PetscCall(PetscFree(data));
2637   PetscFunctionReturn(PETSC_SUCCESS);
2638 }
2639 
2640 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2641 
2642 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643 {
2644   Mat_Product                  *product = C->product;
2645   Mat                           A, B;
2646   PetscInt                      m, n, blda, clda;
2647   PetscBool                     flg, biscuda;
2648   Mat_SeqAIJCUSPARSE           *cusp;
2649   cusparseStatus_t              stat;
2650   cusparseOperation_t           opA;
2651   const PetscScalar            *barray;
2652   PetscScalar                  *carray;
2653   MatMatCusparse               *mmdata;
2654   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655   CsrMatrix                    *csrmat;
2656 
2657   PetscFunctionBegin;
2658   MatCheckProduct(C, 1);
2659   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660   mmdata = (MatMatCusparse *)product->data;
2661   A      = product->A;
2662   B      = product->B;
2663   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666      Instead of silently accepting the wrong answer, I prefer to raise the error */
2667   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670   switch (product->type) {
2671   case MATPRODUCT_AB:
2672   case MATPRODUCT_PtAP:
2673     mat = cusp->mat;
2674     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675     m   = A->rmap->n;
2676     n   = B->cmap->n;
2677     break;
2678   case MATPRODUCT_AtB:
2679     if (!A->form_explicit_transpose) {
2680       mat = cusp->mat;
2681       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682     } else {
2683       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684       mat = cusp->matTranspose;
2685       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686     }
2687     m = A->cmap->n;
2688     n = B->cmap->n;
2689     break;
2690   case MATPRODUCT_ABt:
2691   case MATPRODUCT_RARt:
2692     mat = cusp->mat;
2693     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694     m   = A->rmap->n;
2695     n   = B->rmap->n;
2696     break;
2697   default:
2698     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699   }
2700   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701   csrmat = (CsrMatrix *)mat->mat;
2702   /* if the user passed a CPU matrix, copy the data to the GPU */
2703   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2706 
2707   PetscCall(MatDenseGetLDA(B, &blda));
2708   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711   } else {
2712     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713     PetscCall(MatDenseGetLDA(C, &clda));
2714   }
2715 
2716   PetscCall(PetscLogGpuTimeBegin());
2717 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2720   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2721   #else
2722   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2723   #endif
2724 
2725   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2726   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2727     size_t mmBufferSize;
2728     if (mmdata->initialized && mmdata->Blda != blda) {
2729       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2730       mmdata->matBDescr = NULL;
2731     }
2732     if (!mmdata->matBDescr) {
2733       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2734       mmdata->Blda = blda;
2735     }
2736 
2737     if (mmdata->initialized && mmdata->Clda != clda) {
2738       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2739       mmdata->matCDescr = NULL;
2740     }
2741     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2742       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2743       mmdata->Clda = clda;
2744     }
2745 
2746   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2747     if (matADescr) {
2748       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2749       matADescr = NULL;
2750     }
2751   #endif
2752 
2753     if (!matADescr) {
2754       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2755                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2756       PetscCallCUSPARSE(stat);
2757     }
2758 
2759     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2760 
2761     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2762       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2763       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2764       mmdata->mmBufferSize = mmBufferSize;
2765     }
2766 
2767   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2768     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2769   #endif
2770 
2771     mmdata->initialized = PETSC_TRUE;
2772   } else {
2773     /* to be safe, always update pointers of the mats */
2774     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2775     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2776     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2777   }
2778 
2779   /* do cusparseSpMM, which supports transpose on B */
2780   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2781 #else
2782   PetscInt k;
2783   /* cusparseXcsrmm does not support transpose on B */
2784   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2785     cublasHandle_t cublasv2handle;
2786     cublasStatus_t cerr;
2787 
2788     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2789     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2790     PetscCallCUBLAS(cerr);
2791     blda = B->cmap->n;
2792     k    = B->cmap->n;
2793   } else {
2794     k = B->rmap->n;
2795   }
2796 
2797   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2798   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2799   PetscCallCUSPARSE(stat);
2800 #endif
2801   PetscCall(PetscLogGpuTimeEnd());
2802   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2803   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2804   if (product->type == MATPRODUCT_RARt) {
2805     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2806     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2807   } else if (product->type == MATPRODUCT_PtAP) {
2808     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2809     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2810   } else {
2811     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2812   }
2813   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2814   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2815   PetscFunctionReturn(PETSC_SUCCESS);
2816 }
2817 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a product between a MATSEQAIJCUSPARSE matrix A
  and the matrix B stored in C->product (the type of B is not checked in this routine)

  Input/Output Parameter:
. C - the product matrix; on successful return it has its sizes set, its type set to MATSEQDENSECUDA, a MatMatCusparse
      struct attached to C->product->data and MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA installed as numeric routine

  Notes:
  The supported product types are AB, AtB, ABt, PtAP and RARt; any other type raises PETSC_ERR_GPU.
  For PtAP and RARt the matrix product->B is the one that determines both dimensions of C.

  PETSC_ERR_GPU is also raised when C->product->data is not empty on entry, when A is not of type MATSEQAIJCUSPARSE
  or when A is not stored in MAT_CUSPARSE_CSR format.

  Whether C is of type MATSEQDENSE on entry is recorded in the product data (cisdense) before the type of C is changed.

  For PtAP and RARt an intermediate MATSEQDENSECUDA matrix (mmdata->X) is created here; only its sizes are set.
*/
2818 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2819 {
2820   Mat_Product        *product = C->product;
2821   Mat                 A, B;
2822   PetscInt            m, n;
2823   PetscBool           cisdense, flg;
2824   MatMatCusparse     *mmdata;
2825   Mat_SeqAIJCUSPARSE *cusp;
2826 
2827   PetscFunctionBegin;
2828   MatCheckProduct(C, 1);
2829   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2830   A = product->A;
2831   B = product->B;
2832   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2833   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2834   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2835   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2836   switch (product->type) { /* choose the m x n size of C and its block sizes from the product type */
2837   case MATPRODUCT_AB: /* rows from A, columns from B */
2838     m = A->rmap->n;
2839     n = B->cmap->n;
2840     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2841     break;
2842   case MATPRODUCT_AtB: /* the columns of A give the rows of C */
2843     m = A->cmap->n;
2844     n = B->cmap->n;
2845     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2846     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2847     break;
2848   case MATPRODUCT_ABt: /* the rows of B give the columns of C */
2849     m = A->rmap->n;
2850     n = B->rmap->n;
2851     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2852     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2853     break;
2854   case MATPRODUCT_PtAP: /* C is square, both dimensions come from the columns of B */
2855     m = B->cmap->n;
2856     n = B->cmap->n;
2857     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2858     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2859     break;
2860   case MATPRODUCT_RARt: /* C is square, both dimensions come from the rows of B */
2861     m = B->rmap->n;
2862     n = B->rmap->n;
2863     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2864     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2865     break;
2866   default:
2867     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2868   }
2869   PetscCall(MatSetSizes(C, m, n, m, n)); /* the same values are passed for the local and the global sizes */
2870   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2871   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2872   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2873 
2874   /* product data */
2875   PetscCall(PetscNew(&mmdata));
2876   mmdata->cisdense = cisdense; /* must be queried before MatSetType() above changes the type of C */
2877 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2878   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2879   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2880 #endif
2881   /* for these products we need intermediate storage */
2882   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2883     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2884     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2885     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2886       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); /* X has the shape of A * B^T */
2887     } else {
2888       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); /* X has the shape of A * B */
2889     }
2890   }
2891   C->product->data    = mmdata;
2892   C->product->destroy = MatDestroy_MatMatCusparse; /* callback registered for the product data attached on the previous line */
2893 
2894   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2895   PetscFunctionReturn(PETSC_SUCCESS);
2896 }
2897 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse product described by C->product,
  with A, B and C all of type MATSEQAIJCUSPARSE in MAT_CUSPARSE_CSR format

  Input/Output Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse struct of this product

  Notes:
  The product types handled are AB, AtB and ABt; any other type raises PETSC_ERR_GPU. AtB with A flagged symmetric and
  ABt with B flagged symmetric are computed as AB, after checking that the symbolic phase recorded the same choice
  (PETSC_ERR_PLIB otherwise).

  The cuSPARSE computation is skipped in two cases: when mmdata->reusesym is set (the flag is cleared here, so later
  calls do compute), and when C has no nonzeros (c->nz == 0).

  Depending on the CUDA version the values are computed by cusparseSpGEMMreuse_compute() (>= 11.4), by
  cusparseSpGEMM_compute() followed by cusparseSpGEMM_copy() (11.0 up to 11.4), or by cusparse_csr_spgemm() (< 11.0).

  PETSC_ERR_ARG_WRONG is raised if A or B is bound to the CPU.

  On every path the routine finishes by marking C as assembled and incrementing C->num_ass.

  NOTE(review): for AtB and ABt the stored matTranspose structs are used as they are; this routine does not call
  MatSeqAIJCUSPARSEFormExplicitTranspose(). Confirm that the transposes are guaranteed to be up to date when the
  values of A or B have changed since the symbolic phase.
*/
2898 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2899 {
2900   Mat_Product                  *product = C->product;
2901   Mat                           A, B;
2902   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2903   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2904   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2905   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2906   PetscBool                     flg;
2907   cusparseStatus_t              stat;
2908   MatProductType                ptype;
2909   MatMatCusparse               *mmdata;
2910 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2911   cusparseSpMatDescr_t BmatSpDescr;
2912 #endif
2913   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2914 
2915   PetscFunctionBegin;
2916   MatCheckProduct(C, 1);
2917   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2918   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2919   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2920   mmdata = (MatMatCusparse *)C->product->data;
2921   A      = product->A;
2922   B      = product->B;
2923   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2924     mmdata->reusesym = PETSC_FALSE; /* one-shot: subsequent calls take the compute path below */
2925     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2926     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2927     Cmat = Ccusp->mat;
2928     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2929     Ccsr = (CsrMatrix *)Cmat->mat;
2930     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2931     goto finalize;
2932   }
2933   if (!c->nz) goto finalize; /* C has no nonzeros: none of the checks and cuSPARSE calls below are executed */
2934   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2935   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2936   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2937   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2938   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2939   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2940   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2941   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2942   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2943   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2944   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2945   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2946   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2947   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2948 
2949   ptype = product->type; /* effective product type; may be replaced by AB below while product->type is left untouched */
2950   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2951     ptype = MATPRODUCT_AB;
2952     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2953   }
2954   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2955     ptype = MATPRODUCT_AB;
2956     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2957   }
2958   switch (ptype) { /* opA and opB are always NON_TRANSPOSE, so transposed operands are taken from the matTranspose structs */
2959   case MATPRODUCT_AB:
2960     Amat = Acusp->mat;
2961     Bmat = Bcusp->mat;
2962     break;
2963   case MATPRODUCT_AtB:
2964     Amat = Acusp->matTranspose; /* NOTE(review): used as stored, not regenerated in this routine - confirm it is current */
2965     Bmat = Bcusp->mat;
2966     break;
2967   case MATPRODUCT_ABt:
2968     Amat = Acusp->mat;
2969     Bmat = Bcusp->matTranspose; /* NOTE(review): used as stored, not regenerated in this routine - confirm it is current */
2970     break;
2971   default:
2972     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2973   }
2974   Cmat = Ccusp->mat;
2975   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2976   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2977   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2978   Acsr = (CsrMatrix *)Amat->mat;
2979   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2980   Ccsr = (CsrMatrix *)Cmat->mat;
2981   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2982   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2983   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2984   PetscCall(PetscLogGpuTimeBegin());
2985 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2986   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2987   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); /* Cmat->alpha_one and Cmat->beta_zero are passed by pointer to the calls below */
2988   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2989   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2990   PetscCallCUSPARSE(stat);
2991   #else
2992   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2993   PetscCallCUSPARSE(stat);
2994   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2995   PetscCallCUSPARSE(stat);
2996   #endif
2997 #else
2998   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2999                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3000   PetscCallCUSPARSE(stat);
3001 #endif
3002   PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count stored in the product data, not recomputed here */
3003   PetscCallCUDA(WaitForCUDA());
3004   PetscCall(PetscLogGpuTimeEnd());
3005   C->offloadmask = PETSC_OFFLOAD_GPU;
3006 finalize:
3007   /* shorter version of MatAssemblyEnd_SeqAIJ */
3008   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3009   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3010   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3011   c->reallocs = 0;
3012   C->info.mallocs += 0; /* adds zero, so the counter is left unchanged */
3013   C->info.nz_unneeded = 0;
3014   C->assembled = C->was_assembled = PETSC_TRUE;
3015   C->num_ass++;
3016   PetscFunctionReturn(PETSC_SUCCESS);
3017 }
3018 
3019 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3020 {
3021   Mat_Product                  *product = C->product;
3022   Mat                           A, B;
3023   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3024   Mat_SeqAIJ                   *a, *b, *c;
3025   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3026   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3027   PetscInt                      i, j, m, n, k;
3028   PetscBool                     flg;
3029   cusparseStatus_t              stat;
3030   MatProductType                ptype;
3031   MatMatCusparse               *mmdata;
3032   PetscLogDouble                flops;
3033   PetscBool                     biscompressed, ciscompressed;
3034 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3035   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3036   cusparseSpMatDescr_t BmatSpDescr;
3037 #else
3038   int cnz;
3039 #endif
3040   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3041 
3042   PetscFunctionBegin;
3043   MatCheckProduct(C, 1);
3044   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3045   A = product->A;
3046   B = product->B;
3047   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3048   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3049   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3050   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3051   a = (Mat_SeqAIJ *)A->data;
3052   b = (Mat_SeqAIJ *)B->data;
3053   /* product data */
3054   PetscCall(PetscNew(&mmdata));
3055   C->product->data    = mmdata;
3056   C->product->destroy = MatDestroy_MatMatCusparse;
3057 
3058   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3059   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3060   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3061   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3062   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3063   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3064 
3065   ptype = product->type;
3066   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3067     ptype                                          = MATPRODUCT_AB;
3068     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3069   }
3070   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3071     ptype                                          = MATPRODUCT_AB;
3072     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3073   }
3074   biscompressed = PETSC_FALSE;
3075   ciscompressed = PETSC_FALSE;
3076   switch (ptype) {
3077   case MATPRODUCT_AB:
3078     m    = A->rmap->n;
3079     n    = B->cmap->n;
3080     k    = A->cmap->n;
3081     Amat = Acusp->mat;
3082     Bmat = Bcusp->mat;
3083     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3084     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3085     break;
3086   case MATPRODUCT_AtB:
3087     m = A->cmap->n;
3088     n = B->cmap->n;
3089     k = A->rmap->n;
3090     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3091     Amat = Acusp->matTranspose;
3092     Bmat = Bcusp->mat;
3093     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3094     break;
3095   case MATPRODUCT_ABt:
3096     m = A->rmap->n;
3097     n = B->rmap->n;
3098     k = A->cmap->n;
3099     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3100     Amat = Acusp->mat;
3101     Bmat = Bcusp->matTranspose;
3102     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3103     break;
3104   default:
3105     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3106   }
3107 
3108   /* create cusparse matrix */
3109   PetscCall(MatSetSizes(C, m, n, m, n));
3110   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3111   c     = (Mat_SeqAIJ *)C->data;
3112   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3113   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3114   Ccsr  = new CsrMatrix;
3115 
3116   c->compressedrow.use = ciscompressed;
3117   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3118     c->compressedrow.nrows = a->compressedrow.nrows;
3119     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3120     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3121     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3122     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3123     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3124   } else {
3125     c->compressedrow.nrows  = 0;
3126     c->compressedrow.i      = NULL;
3127     c->compressedrow.rindex = NULL;
3128     Ccusp->workVector       = NULL;
3129     Cmat->cprowIndices      = NULL;
3130   }
3131   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3132   Ccusp->mat        = Cmat;
3133   Ccusp->mat->mat   = Ccsr;
3134   Ccsr->num_rows    = Ccusp->nrows;
3135   Ccsr->num_cols    = n;
3136   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3137   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3138   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3139   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3140   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3141   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3142   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3143   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3144   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3145   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3146   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3147     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3148     c->nz                = 0;
3149     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3150     Ccsr->values         = new THRUSTARRAY(c->nz);
3151     goto finalizesym;
3152   }
3153 
3154   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3155   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3156   Acsr = (CsrMatrix *)Amat->mat;
3157   if (!biscompressed) {
3158     Bcsr = (CsrMatrix *)Bmat->mat;
3159 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3160     BmatSpDescr = Bmat->matDescr;
3161 #endif
3162   } else { /* we need to use row offsets for the full matrix */
3163     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3164     Bcsr                 = new CsrMatrix;
3165     Bcsr->num_rows       = B->rmap->n;
3166     Bcsr->num_cols       = cBcsr->num_cols;
3167     Bcsr->num_entries    = cBcsr->num_entries;
3168     Bcsr->column_indices = cBcsr->column_indices;
3169     Bcsr->values         = cBcsr->values;
3170     if (!Bcusp->rowoffsets_gpu) {
3171       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3172       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3173       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3174     }
3175     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3176     mmdata->Bcsr      = Bcsr;
3177 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3178     if (Bcsr->num_rows && Bcsr->num_cols) {
3179       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3180       PetscCallCUSPARSE(stat);
3181     }
3182     BmatSpDescr = mmdata->matSpBDescr;
3183 #endif
3184   }
3185   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3186   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3187   /* precompute flops count */
3188   if (ptype == MATPRODUCT_AB) {
3189     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3190       const PetscInt st = a->i[i];
3191       const PetscInt en = a->i[i + 1];
3192       for (j = st; j < en; j++) {
3193         const PetscInt brow = a->j[j];
3194         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3195       }
3196     }
3197   } else if (ptype == MATPRODUCT_AtB) {
3198     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3199       const PetscInt anzi = a->i[i + 1] - a->i[i];
3200       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3201       flops += (2. * anzi) * bnzi;
3202     }
3203   } else { /* TODO */
3204     flops = 0.;
3205   }
3206 
3207   mmdata->flops = flops;
3208   PetscCall(PetscLogGpuTimeBegin());
3209 
3210 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3211   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3212   // cuda-12.2 requires non-null csrRowOffsets
3213   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3214   PetscCallCUSPARSE(stat);
3215   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3216   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3217   {
3218     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3219      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3220   */
3221     void *dBuffer1 = NULL;
3222     void *dBuffer2 = NULL;
3223     void *dBuffer3 = NULL;
3224     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3225     size_t bufferSize1 = 0;
3226     size_t bufferSize2 = 0;
3227     size_t bufferSize3 = 0;
3228     size_t bufferSize4 = 0;
3229     size_t bufferSize5 = 0;
3230 
3231     /* ask bufferSize1 bytes for external memory */
3232     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3233     PetscCallCUSPARSE(stat);
3234     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3235     /* inspect the matrices A and B to understand the memory requirement for the next step */
3236     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3237     PetscCallCUSPARSE(stat);
3238 
3239     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3240     PetscCallCUSPARSE(stat);
3241     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3242     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3243     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3244     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3245     PetscCallCUSPARSE(stat);
3246     PetscCallCUDA(cudaFree(dBuffer1));
3247     PetscCallCUDA(cudaFree(dBuffer2));
3248 
3249     /* get matrix C non-zero entries C_nnz1 */
3250     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3251     c->nz = (PetscInt)C_nnz1;
3252     /* allocate matrix C */
3253     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3254     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3255     Ccsr->values = new THRUSTARRAY(c->nz);
3256     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3257     /* update matC with the new pointers */
3258     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3259     PetscCallCUSPARSE(stat);
3260 
3261     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3262     PetscCallCUSPARSE(stat);
3263     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3264     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3265     PetscCallCUSPARSE(stat);
3266     PetscCallCUDA(cudaFree(dBuffer3));
3267     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3268     PetscCallCUSPARSE(stat);
3269     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3270   }
3271   #else
3272   size_t bufSize2;
3273   /* ask bufferSize bytes for external memory */
3274   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3275   PetscCallCUSPARSE(stat);
3276   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3277   /* inspect the matrices A and B to understand the memory requirement for the next step */
3278   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3279   PetscCallCUSPARSE(stat);
3280   /* ask bufferSize again bytes for external memory */
3281   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3282   PetscCallCUSPARSE(stat);
3283   /* The CUSPARSE documentation is not clear, nor the API
3284      We need both buffers to perform the operations properly!
3285      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3286      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3287      is stored in the descriptor! What a messy API... */
3288   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3289   /* compute the intermediate product of A * B */
3290   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3291   PetscCallCUSPARSE(stat);
3292   /* get matrix C non-zero entries C_nnz1 */
3293   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3294   c->nz = (PetscInt)C_nnz1;
3295   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3296                       mmdata->mmBufferSize / 1024));
3297   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3298   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3299   Ccsr->values = new THRUSTARRAY(c->nz);
3300   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3301   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3302   PetscCallCUSPARSE(stat);
3303   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3304   PetscCallCUSPARSE(stat);
3305   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3306 #else
3307   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3308   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3309                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3310   PetscCallCUSPARSE(stat);
3311   c->nz                = cnz;
3312   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3313   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3314   Ccsr->values = new THRUSTARRAY(c->nz);
3315   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3316 
3317   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3318   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3319      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3320      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3321   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3322                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3323   PetscCallCUSPARSE(stat);
3324 #endif
3325   PetscCall(PetscLogGpuFlops(mmdata->flops));
3326   PetscCall(PetscLogGpuTimeEnd());
3327 finalizesym:
3328   c->free_a = PETSC_TRUE;
3329   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3330   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3331   c->free_ij = PETSC_TRUE;
3332   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3333     PetscInt      *d_i = c->i;
3334     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3335     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3336     ii = *Ccsr->row_offsets;
3337     jj = *Ccsr->column_indices;
3338     if (ciscompressed) d_i = c->compressedrow.i;
3339     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3340     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3341   } else {
3342     PetscInt *d_i = c->i;
3343     if (ciscompressed) d_i = c->compressedrow.i;
3344     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3345     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346   }
3347   if (ciscompressed) { /* need to expand host row offsets */
3348     PetscInt r = 0;
3349     c->i[0]    = 0;
3350     for (k = 0; k < c->compressedrow.nrows; k++) {
3351       const PetscInt next = c->compressedrow.rindex[k];
3352       const PetscInt old  = c->compressedrow.i[k];
3353       for (; r < next; r++) c->i[r + 1] = old;
3354     }
3355     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3356   }
3357   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3358   PetscCall(PetscMalloc1(m, &c->ilen));
3359   PetscCall(PetscMalloc1(m, &c->imax));
3360   c->maxnz         = c->nz;
3361   c->nonzerorowcnt = 0;
3362   c->rmax          = 0;
3363   for (k = 0; k < m; k++) {
3364     const PetscInt nn = c->i[k + 1] - c->i[k];
3365     c->ilen[k] = c->imax[k] = nn;
3366     c->nonzerorowcnt += (PetscInt)!!nn;
3367     c->rmax = PetscMax(c->rmax, nn);
3368   }
3369   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3370   PetscCall(PetscMalloc1(c->nz, &c->a));
3371   Ccsr->num_entries = c->nz;
3372 
3373   C->nonzerostate++;
3374   PetscCall(PetscLayoutSetUp(C->rmap));
3375   PetscCall(PetscLayoutSetUp(C->cmap));
3376   Ccusp->nonzerostate = C->nonzerostate;
3377   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3378   C->preallocated     = PETSC_TRUE;
3379   C->assembled        = PETSC_FALSE;
3380   C->was_assembled    = PETSC_FALSE;
3381   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3382     mmdata->reusesym = PETSC_TRUE;
3383     C->offloadmask   = PETSC_OFFLOAD_GPU;
3384   }
3385   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3386   PetscFunctionReturn(PETSC_SUCCESS);
3387 }
3388 
3389 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3390 
/*
  Chooses the symbolic-product routine of mat; handles sparse or dense B.

  The GPU sparse-sparse path is eligible only when neither A nor B is bound to the CPU and B is a
  MATSEQAIJCUSPARSE (for ABC products C must qualify as well). When it is eligible, an options
  database flag lets the user request the CPU code instead; the flag name depends on the product
  type and on whether the product was requested through the user-level API (product->api_user).
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    Bdense = PETSC_FALSE, Bongpu = PETSC_FALSE, Congpu = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &Bdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Bongpu));
  if (product->type == MATPRODUCT_ABC) {
    /* a third matrix is involved: it must be a GPU matrix that is not bound to the CPU */
    Congpu = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Congpu));
  }

  if (Bongpu && Congpu) { /* we can always select the CPU backend */
    /* apiname doubles as the options title for the user-level API and as the manual page name */
    const char *apiname = NULL, *apiopt = NULL, *prodname = NULL;
    PetscBool   usecpu  = PETSC_FALSE;

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname  = "MatMatMult";
      apiopt   = "-matmatmult_backend_cpu";
      prodname = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname  = "MatTransposeMatMult";
      apiopt   = "-mattransposematmult_backend_cpu";
      prodname = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname  = "MatPtAP";
      apiopt   = "-matptap_backend_cpu";
      prodname = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname  = "MatRARt";
      apiopt   = "-matrart_backend_cpu";
      prodname = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname  = "MatMatMatMult";
      apiopt   = "-matmatmatmult_backend_cpu";
      prodname = "MatProduct_ABC";
      break;
    default: /* no CPU-backend option is offered for the remaining product types */
      break;
    }
    if (apiname) {
      const char *title   = product->api_user ? apiname : prodname;
      const char *optname = product->api_user ? apiopt : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(optname, "Use CPU code", apiname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Bongpu = Congpu = PETSC_FALSE;
  }

  /* dispatch */
  if (Bdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      else PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      break;
    default:
      break;
    }
  } else if (Bongpu && Congpu) {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3508 
/* yy = A xx: forwards to the shared multiply-add kernel with no vector to add and no transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3515 
/* zz = A xx + yy: forwards to the shared multiply-add kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3522 
/* yy = A^H xx: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set and no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3529 
/* zz = A^H xx + yy: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3536 
/* yy = A^T xx: forwards to the shared multiply-add kernel with the transpose flag set and no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3543 
/*
  ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Expects a 1D launch providing at least n threads; threads whose index is >= n do nothing.
  The update is a plain read-modify-write (no atomics), so it presumably relies on idx holding
  distinct entries - confirm against the callers.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* build the global index in PetscInt so it has the same width and signedness as n
     (a 32-bit int could wrap before the bounds check when PetscInt is 64-bit) */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3549 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - computes zz = op(A) xx + yy on the GPU

  Input Parameters:
+ A     - the matrix
. xx    - the vector op(A) is applied to
. yy    - the vector to add, or NULL to compute zz = op(A) xx; may be the same vector as zz
. trans - if PETSC_TRUE, op(A) is a transpose of A
- herm  - if PETSC_TRUE (allowed only together with trans), op(A) is the conjugate transpose of A

  Output Parameter:
. zz - the result

  Notes:
  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op. herm without trans raises an error.

  For trans & !herm with A->form_explicit_transpose set, the explicitly stored transpose is used (and built
  on first use) and multiplied with CUSPARSE_OPERATION_NON_TRANSPOSE; otherwise cuSPARSE applies the
  transposition to the stored matrix.

  zz is opened read-write only when yy == zz, and write-only otherwise. Hence the sparse multiply may
  accumulate into zz (beta = 1) only when yy == zz; in every other case it must overwrite zz (beta = 0)
  and yy is added afterwards.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of the SpMV input/output vectors; set and used only for the CSR format */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* no stored entries: the product vanishes, so zz = yy (or zz = 0) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transposition to the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length, so it is written straight into z. The multiply may accumulate into z
         (beta = 1) only when z is y itself, because only then does zarray hold valid input; otherwise
         z's previous content is undefined, so we overwrite it (beta = 0) and add y after the multiply.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* surface launch failures (e.g. invalid configuration) */
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* the multiply overwrote zz (beta = 0) in this case, so yy still has to be added */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3723 
/* zz = A^T xx + yy: forwards to the shared multiply-add kernel with the transpose flag set */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3730 
3731 PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
3732 
/*
  GetDiagonal_CSR - CUDA kernel extracting the diagonal of a CSR matrix, one row per thread.

  Thread r (for r < len) walks the entries row[r] .. row[r + 1] - 1 and stores in diag[r] the value of
  the first entry whose column index equals r, or 0 if the row has no such entry.
  Expects a 1D launch providing at least len threads; surplus threads do nothing.
*/
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r >= len) return;

  const PetscInt first = row[r], last = row[r + 1];
  PetscScalar    entry = 0.0;

  for (PetscInt k = first; k < last; k++) {
    if (col[k] == r) {
      entry = val[k];
      break;
    }
  }
  diag[r] = entry;
}
3750 
/*
  Fills diag with the diagonal of A.

  When the offload mask says the GPU copy is valid (PETSC_OFFLOAD_GPU or PETSC_OFFLOAD_BOTH) the
  diagonal is extracted on the device with the GetDiagonal_CSR kernel (CSR storage only);
  otherwise the host routine MatGetDiagonal_SeqAIJ() is used.
*/
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE           *cusp  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult  = (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat;
  const PetscBool               ongpu = (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  if (!ongpu) {
    PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  const PetscInt nrows = A->rmap->n;
  CsrMatrix     *csr   = (CsrMatrix *)mult->mat;

  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
  if (nrows > 0) { /* nothing to launch for an empty matrix */
    PetscScalar *d_diag;
    const int    nblocks = (int)((nrows + 255) / 256); /* 256 threads per block, one thread per row */

    PetscCall(VecCUDAGetArrayWrite(diag, &d_diag));
    GetDiagonal_CSR<<<nblocks, 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), nrows, d_diag);
    PetscCallCUDA(cudaPeekAtLastError());
    PetscCall(VecCUDARestoreArrayWrite(diag, &d_diag));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3772 
/* Finishes assembly of A by delegating to the host SeqAIJ routine; no GPU-specific work is done here */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3779 
/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

  When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
          `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* create the object, give it its sizes (local and global sizes coincide), then its type */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* the preallocation routine takes a non-const array, hence the cast */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, const_cast<PetscInt *>(nnz)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3829 
/*
  MatDestroy_SeqAIJCUSPARSE - destroy callback for the matrix type.

  Releases the GPU side data hanging off A->spptr (plain matrix or triangular factors,
  depending on A->factortype), clears the functions composed on the object and finally
  hands the matrix over to MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to clear, in the order they are cleared */
  static const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                         "MatCUSPARSESetFormat_C",
                                         "MatCUSPARSESetUseCPUSolve_C",
                                         "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                         "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                         "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                         "MatFactorGetSolverType_C",
                                         "MatSetPreallocationCOO_C",
                                         "MatSetValuesCOO_C",
                                         "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  }
  for (size_t k = 0; k < sizeof(composed) / sizeof(composed[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3851 
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/*
  MatDuplicate_SeqAIJCUSPARSE - duplicate callback: the copy is first produced by
  MatDuplicate_SeqAIJ() and then converted in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* step 1: SeqAIJ duplicate */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* step 2: in-place conversion of the freshly created matrix */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3861 
/*
  MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y.

  Depending on the (possibly upgraded) nonzero pattern relation `str` one of three paths is taken:
    SAME_NONZERO_PATTERN   - a cuBLAS axpy on the value arrays
    SUBSET_NONZERO_PATTERN - a cuSPARSE csr geam writing into the arrays of Y
    anything else          - MatAXPY_SeqAIJ()
  MatAXPY_SeqAIJ() is also used when X and Y do not share the same axpy implementation.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *xseq = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *yseq = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *ydev = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *xdev = (Mat_SeqAIJCUSPARSE *)X->spptr;
  PetscScalar        *yval;
  const PetscScalar  *xval;
  CsrMatrix          *ycsr, *xcsr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(ydev->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(xdev->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix *)ydev->mat->mat;
  xcsr = (CsrMatrix *)xdev->mat->mat;

  /* see if we can turn this into a cublas axpy: compare row offsets first, column indices only if those match */
  if (str != SAME_NONZERO_PATTERN && xseq->nz == yseq->nz && !xseq->compressedrow.use && !yseq->compressedrow.use) {
    const bool identical = thrust::equal(thrust::device, ycsr->row_offsets->begin(), ycsr->row_offsets->end(), xcsr->row_offsets->begin()) && thrust::equal(thrust::device, ycsr->column_indices->begin(), ycsr->column_indices->end(), xcsr->column_indices->begin());

    if (identical) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t blas;
    PetscBLASInt   inc = 1, len = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
    PetscCall(PetscCUBLASGetHandle(&blas));
    PetscCall(PetscBLASIntCast(xseq->nz, &len));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(blas, len, &a, xval, inc, yval, inc));
    PetscCall(PetscLogGpuFlops(2.0 * len));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar unit = 1.0;
    auto        xrow = xcsr->row_offsets->data().get();
    auto        xcol = xcsr->column_indices->data().get();
    auto        yrow = ycsr->row_offsets->data().get();
    auto        ycol = ycsr->column_indices->data().get();
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t workSize;
    void  *work;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
    /* the scalars a and unit live on the host for the geam call; device mode is restored afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(ydev->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(ydev->handle, Y->rmap->n, Y->cmap->n, &a, xdev->mat->descr, xseq->nz, xval, xrow, xcol, &unit, ydev->mat->descr, yseq->nz, yval, yrow, ycol, ydev->mat->descr, yval, yrow, ycol, &workSize));
    PetscCallCUDA(cudaMalloc(&work, workSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ydev->handle, Y->rmap->n, Y->cmap->n, &a, xdev->mat->descr, xseq->nz, xval, xrow, xcol, &unit, ydev->mat->descr, yseq->nz, yval, yrow, ycol, ydev->mat->descr, yval, yrow, ycol, work));
    PetscCall(PetscLogGpuFlops(xseq->nz + yseq->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(work));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ydev->handle, Y->rmap->n, Y->cmap->n, &a, xdev->mat->descr, xseq->nz, xval, xrow, xcol, &unit, ydev->mat->descr, yseq->nz, yval, yrow, ycol, ydev->mat->descr, yval, yrow, ycol));
    PetscCall(PetscLogGpuFlops(xseq->nz + yseq->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(ydev->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3947 
/*
  MatScale_SeqAIJCUSPARSE - computes Y = a*Y with a cuBLAS scal over the y->nz stored values.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *yseq = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t blas;
  PetscBLASInt   inc = 1, len = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&blas));
  PetscCall(PetscBLASIntCast(yseq->nz, &len));

  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blas, len, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(len));
  PetscCall(PetscLogGpuTimeEnd());

  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3967 
/*
  MatZeroEntries_SeqAIJCUSPARSE - sets all stored values to zero.

  For a non-factored matrix the device value arrays (matrix and, if present, its transpose)
  are filled with zeros. When no device values exist for the matrix itself, the host array is
  zeroed instead. A->offloadmask is set according to where the zeroing took place.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij         = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroedOnGpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        zeroedOnGpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  if (!zeroedOnGpu) {
    /* a->i[number of rows] is the number of stored entries */
    PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  } else {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3996 
/* Memory type query callback: always reports PETSC_MEMTYPE_CUDA; the matrix argument is not inspected */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  const PetscMemType mtype = PETSC_MEMTYPE_CUDA;

  PetscFunctionBegin;
  *m = mtype;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4003 
/*
  MatBindToCPU_SeqAIJCUSPARSE - switches the matrix between its CPU (SeqAIJ) and GPU (cuSPARSE)
  operation tables.

  flg == PETSC_TRUE : values are copied back from the GPU, the SeqAIJ operations are installed,
                      the Mat_SeqAIJ array accessors are zeroed and the GPU specific composed
                      functions are removed.
  flg == PETSC_FALSE: the cuSPARSE operations, array accessors and composed functions are installed.

  Factored matrices only record the flag.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (!flg) {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype        = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* GPU specific composed functions to remove, in removal order */
    static const char *const gpuOnly[] = {"MatSeqAIJCopySubArray_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C"};

    /* bring the values back to the host before the CPU kernels take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    for (size_t k = 0; k < sizeof(gpuOnly) / sizeof(gpuOnly[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, gpuOnly[k], NULL));
  }
  A->boundtocpu = flg;
  /* inodes are enabled only on the CPU and only when inode data (size_csr) is present */
  aij->inode.use = (flg && aij->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4071 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a SeqAIJ matrix to MATSEQAIJCUSPARSE.

  MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the existing
  *newmat, any other reuse value works directly on *newmat. The result gets VECCUDA as default
  vector type, a cuSPARSE handle (when spptr is not yet allocated and the matrix is not being
  reused), the type specific operations and the GPU operation table.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that stay the same whether or not the matrix is bound to the CPU */
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* install the GPU operation table */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4132 
/* Type constructor: builds a SeqAIJ matrix and converts it in place to MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* host side SeqAIJ set up first */
  PetscCall(MatCreate_SeqAIJ(B));
  /* then the in-place conversion, which works on B itself */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4140 
4141 /*MC
4142    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4143 
4144    Options Database Keys:
4145 +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4146 .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4147                                            Other options include ell (ellpack) or hyb (hybrid).
4148 .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4149 -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4150 
4151   Level: beginner
4152 
4153   Notes:
4154   These matrices can be in either CSR, ELL, or HYB format.
4155 
4156   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4157 
4158   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4159   if some integer values passed in do not fit in `int`.
4160 
4161 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4162 M*/
4163 
/* Registers MatGetFactor_seqaijcusparse_cusparse() as the MATSOLVERCUSPARSE factory for LU, Cholesky, ILU and ICC */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(ftypes) / sizeof(ftypes[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4173 
/*
  MatSeqAIJCUSPARSE_Destroy - frees the Mat_SeqAIJCUSPARSE structure stored in mat->spptr:
  both mult structures, the auxiliary device arrays, the cuSPARSE handle and the structure itself.
  Does nothing when mat->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!gpu) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->mat, gpu->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
  delete gpu->workVector;
  delete gpu->rowoffsets_gpu;
  delete gpu->csr2csc_i;
  delete gpu->coords;
  if (gpu->handle) PetscCallCUSPARSE(cusparseDestroy(gpu->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4191 
/* Deletes the three arrays of a CsrMatrix and the matrix itself, then clears the caller's pointer; a NULL *mat is ignored */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4204 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  Overload for triangular factor structures (only compiled for CUDA older than 11.4): releases the
  descriptor, the solve info, the CSR matrix and the device/pinned-host buffers, then frees the structure.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(PETSC_SUCCESS);
  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4223 
/*
  Overload for mult structures: destroys the stored matrix (CSR, or HYB on CUDA older than 11.0),
  the descriptors, the device scalars and the cached SpMV/SpMM data, deletes the structure and
  sets *matstruct to NULL. A NULL *matstruct is ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms = *matstruct;

  PetscFunctionBegin;
  if (!ms) PetscFunctionReturn(PETSC_SUCCESS);

  if (ms->mat) {
    if (format != MAT_CUSPARSE_ELL && format != MAT_CUSPARSE_HYB) {
      CsrMatrix *csr = (CsrMatrix *)ms->mat;

      PetscCall(CsrMatrix_Destroy(&csr));
    } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      PetscCallCUSPARSE(cusparseDestroyHybMat((cusparseHybMat_t)ms->mat));
#endif
    }
  }
  if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
  delete ms->cprowIndices;
  if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
  if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
  if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));

  /* three cached slots; only the initialized ones own resources */
  for (int slot = 0; slot < 3; slot++) {
    if (!ms->cuSpMV[slot].initialized) continue;
    PetscCallCUDA(cudaFree(ms->cuSpMV[slot].spmvBuffer));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[slot].vecXDescr));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[slot].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
    if (ms->matDescr_SpMV[slot]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMV[slot]));
    if (ms->matDescr_SpMM[slot]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMM[slot]));
  #endif
  }
#endif
  delete ms;
  *matstruct = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4270 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases everything owned by a triangular factors structure
  while keeping the structure itself (and its cuSPARSE handle) alive.

  Input Parameter:
. trifactors - address of the pointer to the structure; a NULL structure is ignored

  Every released field is set back to NULL so that the routine can be called repeatedly, e.g. a
  reset followed by MatSeqAIJCUSPARSETriFactors_Destroy() (which resets again), without freeing
  any device buffer or cuSPARSE descriptor twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays: free, then forget the stale address */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    fs->factBuffer_M = NULL; /* aliased one of the two buffers just freed */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;
    /* cuSPARSE descriptors and factorization infos */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;
    /* host arrays: PetscFree() already sets the pointers to NULL */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4324 
/* Resets the triangular factors structure, destroys its cuSPARSE handle and frees the structure; a NULL structure is ignored */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4335 
/* Strict lexicographic "less than" on (first, second) index pairs, usable on host and device */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt first1 = thrust::get<0>(t1), first2 = thrust::get<0>(t2);

    /* the second components only decide when the first ones tie */
    return first1 < first2 || (first1 == first2 && thrust::get<1>(t1) < thrust::get<1>(t2));
  }
};
4344 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached transpose of A as out of date.

  With destroy == PETSC_TRUE the transpose mult structure and the csr2csc_i array are released
  as well. Nothing is done when A has no spptr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (gpu) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
      delete gpu->csr2csc_i;
      gpu->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4360 
/* Destructor for a COO structure: frees its perm and jmap arrays with cudaFree() and the structure itself with PetscFree() */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *cooDev = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  /* the two index maps first, then the structure that holds them */
  PetscCallCUDA(cudaFree(cooDev->perm));
  PetscCallCUDA(cudaFree(cooDev->jmap));
  PetscCall(PetscFree(cooDev));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4371 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE.

  The host routine MatSetPreallocationCOO_SeqAIJ() does the actual analysis; device resident
  index arrays are copied to temporary host buffers for it (the memory type is determined from
  coo_i only). Afterwards the matrix is pushed to the GPU and a copy of the host COO structure,
  with jmap and perm duplicated in device memory, is attached to the matrix under the name
  "__PETSc_MatCOOStruct_Device".
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         mtype = PETSC_MEMTYPE_HOST;
  PetscBool            staged;
  PetscInt            *rows = coo_i, *cols = coo_j;
  PetscContainer       hostContainer;
  MatCOOStruct_SeqAIJ *cooHost, *cooDev;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  staged = PetscMemTypeDevice(mtype) ? PETSC_TRUE : PETSC_FALSE;
  if (staged) {
    /* indices live on the device: stage them on the host */
    PetscCall(PetscMalloc2(coo_n, &rows, coo_n, &cols));
    PetscCallCUDA(cudaMemcpy(rows, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(cols, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, rows, cols));
  if (staged) PetscCall(PetscFree2(rows, cols));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&hostContainer));
  PetscCall(PetscContainerGetPointer(hostContainer, (void **)&cooHost));
  PetscCall(PetscMalloc1(1, &cooDev));
  *cooDev = *cooHost; // do a shallow copy and then amend some fields that need to be different
  // jmap has nz + 1 entries
  PetscCallCUDA(cudaMalloc((void **)&cooDev->jmap, (cooHost->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(cooDev->jmap, cooHost->jmap, (cooHost->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  // perm has Atot entries
  PetscCallCUDA(cudaMalloc((void **)&cooDev->perm, cooHost->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(cooDev->perm, cooHost->perm, cooHost->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", cooDev, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4412 
/*
  MatAddCOOValues - kernel that accumulates COO input values into the matrix value array.

  For every i in [0, nnz) the input values kv[perm[k]], k in [jmap[i], jmap[i+1]), are summed and
  either stored in a[i] (INSERT_VALUES) or added to it (any other mode).

  Launch layout: 1D grid of 1D blocks. The loop strides by the total number of launched threads,
  so any grid size is valid. No shared memory is used.

  The thread index and the stride are widened to PetscCount before the multiplication; the
  built-in blockIdx/blockDim/gridDim components are 32-bit unsigned and their product would
  otherwise wrap before being widened.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4423 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets/adds the values of a matrix preallocated with the COO interface.

  Input Parameters:
+ A     - the matrix
. v     - the coo->n input values, in host or device memory
- imode - INSERT_VALUES to overwrite the stored values, otherwise the sums are added to them

  Host resident values are first copied to a temporary device buffer. The accumulation itself is
  done by the MatAddCOOValues kernel using the jmap/perm maps of the device COO structure attached
  to the matrix.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* v[] is on the host: stage it in a temporary device buffer */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* with INSERT_VALUES the old values are not needed, so write-only access is enough */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    const int        nthreads  = 256;
    const PetscCount maxblocks = 8388607; /* (2^31 - 1) / 256: keeps blocks * threads below 2^31 */
    PetscCount       nblocks   = (Annz + nthreads - 1) / nthreads;

    /* The block count is computed in PetscCount, not int, so that Annz + 255 cannot overflow.
       It is clamped because the kernel loops with a stride of the launched thread count and
       therefore still covers every entry. */
    if (nblocks > maxblocks) nblocks = maxblocks;
    MatAddCOOValues<<<(int)nblocks, nthreads>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4463 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  Either `i` or `j` may be `NULL` if the corresponding array is not needed; when both are `NULL` the routine returns immediately

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* nothing to do only when neither array is requested; a single requested array must still be returned */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the full row offsets are created from the host a->i on first use and cached in rowoffsets_gpu */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4511 
4512 /*@C
4513   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4514 
4515   Not Collective
4516 
4517   Input Parameters:
4518 + A          - the matrix
4519 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4520 . i          - the CSR row pointers
4521 - j          - the CSR column indices
4522 
4523   Level: developer
4524 
4525 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4526 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed; /* accepted for symmetry with the getter, not used here */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the caller's pointers are reset; the device arrays themselves are not touched */
  if (j) *j = nullptr;
  if (i) *i = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4537 
4538 /*@C
4539   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored
4540 
4541   Not Collective
4542 
4543   Input Parameter:
4544 . A - a `MATSEQAIJCUSPARSE` matrix
4545 
4546   Output Parameter:
4547 . a - pointer to the device data
4548 
4549   Level: developer
4550 
4551   Note:
4552   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4553 
4554 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4555 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the ELL and HYB storage formats are rejected */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the device copy up to date before handing out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

  CsrMatrix *storage = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(storage->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = storage->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4573 
4574 /*@C
4575   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4576 
4577   Not Collective
4578 
4579   Input Parameters:
4580 + A - a `MATSEQAIJCUSPARSE` matrix
4581 - a - pointer to the device data
4582 
4583   Level: developer
4584 
4585 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4586 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: the matrix is left as is, only the caller's pointer is reset */
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4596 
4597 /*@C
4598   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4599 
4600   Not Collective
4601 
4602   Input Parameter:
4603 . A - a `MATSEQAIJCUSPARSE` matrix
4604 
4605   Output Parameter:
4606 . a - pointer to the device data
4607 
4608   Level: developer
4609 
4610   Note:
4611   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4612 
4613 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4614 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the ELL and HYB storage formats are rejected */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: the current values must be on the device first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

  CsrMatrix *storage = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(storage->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = storage->values->data().get();
  /* the caller may modify the values: mark the device copy as the valid one and invalidate the transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

4634 /*@C
4635   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4636 
4637   Not Collective
4638 
4639   Input Parameters:
4640 + A - a `MATSEQAIJCUSPARSE` matrix
4641 - a - pointer to the device data
4642 
4643   Level: developer
4644 
4645 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4646 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified through the pointer: invalidate the diagonal information and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4658 
4659 /*@C
4660   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4661 
4662   Not Collective
4663 
4664   Input Parameter:
4665 . A - a `MATSEQAIJCUSPARSE` matrix
4666 
4667   Output Parameter:
4668 . a - pointer to the device data
4669 
4670   Level: developer
4671 
4672   Note:
4673   Does not trigger any host to device copies.
4674 
4675   It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current
4676 
4677 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4678 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the ELL and HYB storage formats are rejected */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no host-to-device copy is made here, so the device structure must already exist */
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

  CsrMatrix *storage = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(storage->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = storage->values->data().get();
  /* mark the device copy as the valid one and invalidate the transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4697 
4698 /*@C
4699   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4700 
4701   Not Collective
4702 
4703   Input Parameters:
4704 + A - a `MATSEQAIJCUSPARSE` matrix
4705 - a - pointer to the device data
4706 
4707   Level: developer
4708 
4709 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4710 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values were (re)written through the pointer: invalidate the diagonal information and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4722 
/* Orders 4-tuples lexicographically by their first two entries; the last two entries do not take part in the comparison */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int first1 = thrust::get<0>(t1);
    const int first2 = thrust::get<0>(t2);

    /* different first entries decide the order, ties are broken by the second entry */
    if (first1 != first2) return first1 < first2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4731 
/* Unary functor adding a fixed offset, chosen at construction, to an integer */
struct Shift {
  int _shift;

  Shift(int shift)
  {
    _shift = shift;
  }
  __host__ __device__ inline int operator()(const int &c)
  {
    const int shifted = c + _shift;
    return shifted;
  }
};
4738 
/*
  MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows, i.e. C = [A, B]
  ([A';B']' operation in MATLAB notation)

  Input Parameters:
    A     - the left matrix
    B     - the right matrix, it must have the same number of rows as A
    reuse - MAT_INITIAL_MATRIX creates C; with any other value C must come from a previous call and only its values are updated.
            MAT_INPLACE_MATRIX is not supported

  Output Parameter:
    C - the merged matrix, with A->rmap->n rows and A->cmap->n + B->cmap->n columns

  Notes:
  The ELL and HYB storage formats are not supported.

  With MAT_INITIAL_MATRIX, the positions inside the values of C of the entries of A, followed by those of the entries of B,
  are stored in the coords array of C; the update path uses them to scatter the values of A and B into C.

  If A and B both request explicit transposes, the transpose of C is assembled by stacking the transposes of A and B.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its (empty) device CSR structure */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants */
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* temporary COO row indices of A, B and C */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into one row index per entry */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* integer flags carried through the merge: 1 marks an entry coming from A, 0 an entry coming from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* the columns of B are placed after the A->cmap->n columns of A */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      /* coords stores first the positions in C of the entries of A (from p1), then those of the entries of B (from p2) */
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      /* merge the (row, column, value, flag) tuples of A and B, ordered by (row, column), directly into C */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift of the columns of B */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
  #else
      auto pred = cuda::std::identity();
  #endif
      /* positions whose flag is nonzero (entries of A) go to p1, the remaining ones (entries of B) go to p2 */
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged row indices into the CSR row offsets of C */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the transpose of C stacks the rows of the transpose of A on top of those of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one entry: the first shifted offset of the transpose of B overwrites the last offset copied from the transpose of A */
          thrust::advance(rT, -1);
        }
        if (BT) {
          /* the offsets of the transpose of B are shifted by the number of nonzeros of A */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        /* column indices and values are plain concatenations */
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the nonzero pattern of C on the host */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    /* host values are only allocated here; the up-to-date values live on the device (see the offloadmask set below) */
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* the update path dereferences *C and its SeqAIJCUSPARSE data right away: validate it first */
    PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
    PetscCheckTypeName(*C, MATSEQAIJCUSPARSE);
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      /* the sizes of A, B, C and coords must be mutually consistent before scattering */
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords: [begin, pmid) addresses the entries of A inside C, [pmid, end) those of B */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* NOTE(review): VecCUDAEquals is defined elsewhere; it is expected to copy the first tuple element into the second, i.e. scatter the values of A and B into C */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* refresh the values of the transpose of C by concatenating those of the transposes of A and B */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
5023 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies stored values of A from the device into v

  Input Parameters:
    A   - the matrix
    n   - the number of values to copy
    idx - positions, within the device array of values, of the entries to copy; it is handed to thrust as a host range.
          If NULL (or if n is 0), the first n values are copied contiguously

  Output Parameter:
    v - the copied values, either in host or in device memory (detected with isCudaMem())
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                     w; /* device staging buffer, only sized when v is host memory; released automatically on every return path */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[k]] with dv[k]; NOTE(review): VecCUDAEquals is defined elsewhere and is expected to copy the first tuple element into the second */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled from the device to the host */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5059