xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 6bf8f4d0f708ec00e4c420f9dda62f0349c612b8)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17 // thrust::for_each(thrust::cuda::par.on()) requires C++14
18 #endif
19 #include <thrust/iterator/constant_iterator.h>
20 #include <thrust/remove.h>
21 #include <thrust/sort.h>
22 #include <thrust/unique.h>
23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24   #include <cuda/std/functional>
25 #endif
26 
/* Printable names for MatCUSPARSEStorageFormat. The trailing entries (enum type name, option
   prefix, NULL sentinel) follow the array layout expected by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
  The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
  0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
37 
/* Forward declarations of the factorization interfaces implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Legacy triangular-solve path used before the generic cuSPARSE SpSV API (CUDA < 11.4) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
/* Mat operations overridden by the CUSPARSE implementation */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destruction helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

/* Host/device synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* COO assembly and sub-array extraction */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71 
// Back-end for MatCUSPARSESetFormat(): records the requested GPU storage format in the
// matrix's Mat_SeqAIJCUSPARSE context. For a sequential matrix MAT_CUSPARSE_MULT and
// MAT_CUSPARSE_ALL affect the same (single) format field; any other operation is rejected.
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) cusp->format = format;
  else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  PetscFunctionReturn(PETSC_SUCCESS);
}
89 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; silently a no-op for non-CUSPARSE matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
113 
// Back-end for MatCUSPARSESetUseCPUSolve(): stores the CPU-solve flag in the matrix's
// Mat_SeqAIJCUSPARSE context so later MatSolve() calls can pick the CPU path.
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
122 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; silently a no-op for non-CUSPARSE matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
146 
// Handle the options that need CUSPARSE-specific treatment; everything else is
// delegated to the base MatSetOption_SeqAIJ() implementation.
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when turning the option off, destroy any cached transpose so a stale copy cannot be
       used if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
162 
// Process runtime options for MATSEQAIJCUSPARSE matrices: GPU storage formats, the CPU-solve
// flag, and (CUDA >= 11) the cuSPARSE algorithm selections for SpMV/SpMM/csr2csc.
// Factored matrices (A->factortype != MAT_FACTOR_NONE) take no options here.
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    // storage format used by MatMult() only
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    // storage format used by all operations (overrides the MULT-only option above when both are given)
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same position-vs-value consistency check as for SpMV above */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
198 
199 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build the single-CSR device representation of the LU factors of A and set up cuSPARSE SpSV
// (sparse triangular solve) for subsequent MatSolve() calls.
//
// PETSc's host factor stores L by rows via Ai/Aj/Aa and addresses U through the diagonal
// markers adiag[] (note below that the stored diagonal is the inverse: the CSR diagonal is
// recovered as 1/Aa[adiag[i]]). This routine merges both factors into one regular CSR matrix M
// whose row i is [strict lower part of L | diagonal | upper part of U], uploads it once, and
// creates two cusparse matrix descriptors sharing the same CSR arrays — one viewed as
// lower/unit-diagonal (L) and one as upper/non-unit-diagonal (U).
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];     // number of strict-lower entries in row i
        PetscInt ulen = adiag[i] - adiag[i + 1]; // number of U entries in row i, diagonal included
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // the U descriptor shares the same device CSR arrays, only the fill mode/diag type differ
      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: row pointers and the value staging array are kept on host,
      // the temporary column-index array is no longer needed after the upload above
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value (numeric part runs on every call, the structural setup above only once)
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = adiag[i] - adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; // transpose solves must redo their analysis
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
305 #else
// Legacy (CUDA < 11.4) path: extract the unit-diagonal lower triangular factor L from the host
// ILU factor of A, upload it as a CSR matrix, and run the csrsv solve analysis.
// On first call the full structure is built and loTriFactorPtr is created; on later calls only
// the numerical values are refreshed and re-uploaded.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the thrust assign()s below transfer efficiently */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal; each later row i
           copies its strict-lower entries from A's factor and appends a 1.0 diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: copy the host CSR arrays into device-side thrust vectors */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AALo is retained (as AA_h) for the value-only update path below */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only: the sparsity pattern on the device is already correct */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
436 
// Legacy (CUDA < 11.4) path: extract the upper triangular factor U from the host ILU factor of A,
// upload it as a CSR matrix, and run the csrsv solve analysis. The host factor stores U rows
// backwards through the diagonal markers adiag[] with an inverted diagonal (note the 1./v[nz]
// below), so rows are rebuilt here from the last row upward. First call builds structure + values;
// later calls refresh values only.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj                 = a->j, *adiag, *vi;
  const MatScalar                   *aa                 = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host memory so the thrust assign()s below transfer efficiently */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from the bottom up since that is how
           the factored storage lays U out */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements (stored inverted in the factor, so invert back) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: copy the host CSR arrays into device-side thrust vectors */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AAUp is retained (as AA_h) for the value-only update path below */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only: the sparsity pattern on the device is already correct */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
568 #endif
569 
// Ensure the ILU factors of A are present and analyzed on the GPU, and cache the
// row/column permutation indices on the device for use by the triangular solves.
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row, colperm = aij->icol;
  PetscBool                     rowIsIdentity, colIsIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif

  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  // Cache the row permutation on the device (only needed when it is not the identity)
  PetscCall(ISIdentity(rowperm, &rowIsIdentity));
  if (!rowIsIdentity && !fs->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(rowperm, &idx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(rowperm, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  // Likewise for the (inverse) column permutation
  PetscCall(ISIdentity(colperm, &colIsIdentity));
  if (!colIsIdentity && !fs->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(colperm, &idx));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(colperm, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
616 
617 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or refresh) the device-side Cholesky/ICC factor data used by MatSolve_SeqAIJCUSPARSE_Cholesky().
// (The spelling "Cheolesky" is long-standing and must match the call sites in this file.)
//
// The host factored matrix (see MatICCFactorSymbolic_SeqAIJ() for its layout) is repacked into
//   - a CSR unit-diagonal upper-triangular matrix U on the device (fs->csrRowPtr/csrColIdx/csrVal), and
//   - a separate diagonal vector fs->diag,
// so that cusparse SpSV can be used to solve Ut D U x = b.  The sparsity pattern, descriptors and
// SpSV buffers are created only on the first call (guarded by fs->csrRowPtr); subsequent calls only
// refresh the numerical values and redo/update the SpSV analysis.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; only the structure (row pointers, column indices) is copied
      // here -- the values are filled in the refresh section below on every call.
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    // D gets the host diagonal (which is already inverted -- see below); U gets a cosmetic unit
    // diagonal plus the negated off-diagonal entries of the host factor.
    // NOTE(review): the sign flip mirrors how the host ICC/Cholesky factorization stores U --
    // confirm against MatCholeskyFactorNumeric_SeqAIJ()/MatICCFactorSymbolic_SeqAIJ().
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    // If the analysis was already done, cheaply update the values in the SpSV descriptors
    // (available since CUDA 12.1.1) instead of redoing the full analysis.
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
712 
// Solve Ut D U x = b
// U is the unit-diagonal upper factor and D the diagonal assembled on the device by
// MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(). D is stored already inverted, so the middle
// step is a plain element-wise multiplication. Optional row/column permutations
// (fs->rpermIndices / fs->cpermIndices) are applied to b on entry and to the result on exit,
// using fs->X as a staging buffer.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); // no copy; read b in place
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); // no permutation: write the solution straight into x
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); // flop estimate for the two triangular solves plus the diagonal scaling
  PetscFunctionReturn(PETSC_SUCCESS);
}
768 #else
/*
  Build (or refresh) the two cusparse triangular-factor structures for an ICC factorization
  (legacy, CUDA < 11.4 code path).

  The host factor is accessed through the Mat_SeqSBAIJ view of A->data (only the upper triangle is
  stored) and is repacked into two device CSR matrices:
    - upTriFactor: a unit-diagonal upper factor, solved with CUSPARSE_OPERATION_NON_TRANSPOSE;
    - loTriFactor: a diagonally-scaled copy used for the lower solve; it is stored as an UPPER
      matrix but solved with CUSPARSE_OPERATION_TRANSPOSE, which is why its fill mode is also
      CUSPARSE_FILL_MODE_UPPER.
  On the first call both structures and the cusparse solve-analysis data are created; later calls
  only refresh the numerical values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* NOTE(review): A->data is viewed as both AIJ and SBAIJ; the factored ICC matrix is stored in SBAIJ layout */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of the two factors */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* NOTE(review): the diagonal value is read from v[nz], i.e. after the nz off-diagonal
             entries of the row, and its reciprocal is stored in both factors -- confirm this layout
             against MatICCFactorNumeric_SeqAIJ() */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* negate the off-diagonal entries; AALo additionally divides by the stored diagonal value v[nz] */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* stored as upper but solved with CUSPARSE_OPERATION_TRANSPOSE below, so it acts as the lower factor */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same sparsity pattern as the upper factor; only the values (AALo) differ */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structures already exist: only refresh the numerical values (same recurrences as above) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        /* NOTE(review): the messages below say "cusparseTriFactors" but the checks are on the individual factor structs */
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
965 #endif
966 
// Finish an ICC factorization on the GPU: build (or refresh) the device factor data and, when
// the factorization uses a nontrivial ordering, cache the permutation and its inverse on the
// device for use by the triangular solves.
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscBool                     isIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif
  fs->nnz = (aij->nz - m) * 2 + m; // nnz of the full symmetric factor, counting the diagonal once

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  // Cache the permutation (rperm) and its inverse (cperm) on the device, unless trivial
  PetscCall(ISIdentity(rowperm, &isIdentity));
  if (!isIdentity) {
    IS              invperm;
    const PetscInt *inv, *fwd;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &inv));
    PetscCall(ISGetIndices(rowperm, &fwd));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(fwd, fwd + m);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(inv, inv + m);
    PetscCall(ISRestoreIndices(invperm, &inv));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm, &fwd));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1008 
// Numeric Cholesky/ICC factorization for SeqAIJCUSPARSE: factor on the host, then push the
// factors to the GPU and install the GPU triangular-solve callbacks on B.
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); // host factorization below reads A on the CPU
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; // fresh factors live on the CPU until copied below

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  // The SpSV-based solve handles permutations itself, so one routine covers both cases
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  // Pick the MatSolve variant according to whether the factorization ordering is trivial
  Mat_SeqAIJ *bseq    = (Mat_SeqAIJ *)B->data;
  IS          rowperm = bseq->row;
  PetscBool   identity;

  PetscCall(ISIdentity(rowperm, &identity));
  B->ops->solve          = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1041 
1042 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1043 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1044 {
1045   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1046   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1047   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1048   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1049   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1050   cusparseIndexBase_t                indexBase;
1051   cusparseMatrixType_t               matrixType;
1052   cusparseFillMode_t                 fillMode;
1053   cusparseDiagType_t                 diagType;
1054 
1055   PetscFunctionBegin;
1056   /* allocate space for the transpose of the lower triangular factor */
1057   PetscCall(PetscNew(&loTriFactorT));
1058   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1059 
1060   /* set the matrix descriptors of the lower triangular factor */
1061   matrixType = cusparseGetMatType(loTriFactor->descr);
1062   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1063   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1064   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1065 
1066   /* Create the matrix description */
1067   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1068   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1069   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1070   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1071   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1072 
1073   /* set the operation */
1074   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1075 
1076   /* allocate GPU space for the CSC of the lower triangular factor*/
1077   loTriFactorT->csrMat                 = new CsrMatrix;
1078   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1079   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1080   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1081   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1082   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1083   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1084 
1085   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1086   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1087   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1088                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1089                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1090   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1091   #endif
1092 
1093   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1094   {
1095     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1096     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1097                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1098   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1099                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1100   #else
1101                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1102   #endif
1103     PetscCallCUSPARSE(stat);
1104   }
1105 
1106   PetscCallCUDA(WaitForCUDA());
1107   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1108 
1109   /* Create the solve analysis information */
1110   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1111   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1112   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1113   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1114                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1115   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1116   #endif
1117 
1118   /* perform the solve analysis */
1119   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1120                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1121 
1122   PetscCallCUDA(WaitForCUDA());
1123   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1124 
1125   /* assign the pointer */
1126   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1127 
1128   /*********************************************/
1129   /* Now the Transpose of the Upper Tri Factor */
1130   /*********************************************/
1131 
1132   /* allocate space for the transpose of the upper triangular factor */
1133   PetscCall(PetscNew(&upTriFactorT));
1134   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1135 
1136   /* set the matrix descriptors of the upper triangular factor */
1137   matrixType = cusparseGetMatType(upTriFactor->descr);
1138   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1139   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1140   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1141 
1142   /* Create the matrix description */
1143   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1144   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1145   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1146   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1147   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1148 
1149   /* set the operation */
1150   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1151 
1152   /* allocate GPU space for the CSC of the upper triangular factor*/
1153   upTriFactorT->csrMat                 = new CsrMatrix;
1154   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1155   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1156   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1157   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1158   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1159   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1160 
1161   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1162   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1163   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1164                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1165                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1166   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1167   #endif
1168 
1169   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1170   {
1171     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1172     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1173                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1174   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1175                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1176   #else
1177                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1178   #endif
1179     PetscCallCUSPARSE(stat);
1180   }
1181 
1182   PetscCallCUDA(WaitForCUDA());
1183   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1184 
1185   /* Create the solve analysis information */
1186   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1187   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1188   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1189   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1190                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1191   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1192   #endif
1193 
1194   /* perform the solve analysis */
1195   /* christ, would it have killed you to put this stuff in a function????????? */
1196   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1197                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1198 
1199   PetscCallCUDA(WaitForCUDA());
1200   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1201 
1202   /* assign the pointer */
1203   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1204   PetscFunctionReturn(PETSC_SUCCESS);
1205 }
1206 #endif
1207 
/* Unary functor: truncate the real part of a PetscScalar to a PetscInt.
   Used below with thrust::transform to recover integer permutation indices
   that were carried through cusparse csr2csc as scalar values. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    const PetscReal re = PetscRealPart(s);
    return static_cast<PetscInt>(re);
  }
};
1211 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - ensure an up-to-date explicit transpose of A is
  stored in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Returns immediately when A->transupdated is already set. On the first CSR-format call it
  also computes and caches cusparsestruct->csr2csc_i, the permutation mapping CSR value
  order to CSC value order: csr2csc is run numerically on the sequence 0,1,...,nnz-1 and
  the transposed values are truncated back to integers (PetscScalarToPetscInt). Every later
  refresh of the transposed values is then a single thrust gather through that permutation.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR (ELL/HYB) transposes are rebuilt from scratch rather than updated in place */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose has A's row/column dimensions swapped and the same nonzero count */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* 32-bit device copy of A's row offsets; used as csr2csc input further below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* legacy (pre CUDA-11) path: HYB -> CSR -> CSC -> HYB via temporary CSR matrices */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    /* first time through: compute the CSR->CSC value permutation csr2csc_i by numerically
       transposing the sequence 0,1,...,nnz-1 and truncating the transposed values to ints */
    if (!cusparsestruct->csr2csc_i) {
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather through the cached permutation: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1404 
1405 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Solve A x = b with the cached cuSPARSE SpSV LU factors: apply the row permutation
  to b (when present), forward-solve with L, back-solve with U, then undo the column
  permutation (when present). fs->X and fs->Y are device work vectors already wrapped
  by the dense-vector descriptors dnVecDescr_X/Y.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *a       = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     opA     = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                m       = A->rmap->n;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));

  const thrust::device_ptr<const PetscScalar> bdev = thrust::device_pointer_cast(bdata);
  const thrust::device_ptr<PetscScalar>       xdev = thrust::device_pointer_cast(xdata);

  /* Stage the right-hand side in the X descriptor: either b directly, or b gathered
     through the row permutation into the work vector factors->X */
  if (!factors->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdata));
  } else {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  }

  /* Forward solve L Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  /* Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, opA, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_L));

  /* Backward solve U X = Y; solve into the work vector when a column permutation must follow,
     otherwise directly into x */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->cpermIndices ? (void *)factors->X : (void *)xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, opA, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  /* Reorder the work vector with the column permutation and put the result back into x */
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + m), factors->cpermIndices->end()), xdev));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1457 
/*
  Solve A^T x = b with the SpSV LU factors of A. The same spMatDescr_L/U matrices are
  reused with op = CUSPARSE_OPERATION_TRANSPOSE, so the solve order flips relative to
  MatSolve: first U^T y = (row-permuted) b, then L^T x = y, then the column permutation
  is undone. The transpose-solve descriptors (spsvDescr_Lt/Ut) and their analysis are
  created lazily on the first call and cached via createdTransposeSpSVDescr /
  updatedTransposeSpSVAnalysis.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    /* size and allocate the external buffers needed by the transposed L and U solves */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (re)run the analysis phase for the transposed solves when it is stale */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1528 #else
1529 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solve A^T x = b on the legacy (pre CUDA-11.4) csrsv path: row-permute b into x,
  solve with U^T then L^T (the explicitly transposed factors), then undo the column
  permutation through the work vector. The transposed factors are generated lazily
  on the first call via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().

  Fix: the three thrust::copy calls were previously unchecked; they are now wrapped
  in PetscCallThrust, consistent with the rest of this file, so thrust exceptions
  are converted into PETSc errors instead of escaping.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, writing into x */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve with U^T (x -> temp) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with L^T (temp -> x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1581 
/* Solve A^T x = b when the factorization used the natural (identity) ordering,
   so no row/column permutations are applied.  Transposed triangular factors are
   built lazily on first use.  For A = LU, we solve U^T (L^T x) = b: first the
   U^T sweep into the work vector, then the L^T sweep into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Transposed factors are created on demand the first time a transpose solve is requested */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  /* Raw device pointers of the vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Sweep with the transposed upper factor: work = U^{-T} b */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Sweep with the transposed lower factor: x = L^{-T} work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x, loT->solvePolicy, loT->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1619 
/* MatSolve_SeqAIJCUSPARSE - solve A x = b using a previously computed LU factorization
   that involves non-trivial row/column permutations (rpermIndices/cpermIndices).

   The pipeline is: permute b by rperm into the work vector, forward-solve L, back-solve U,
   then permute by cperm into x.  Note the ping-pong between xarray and the work vector:
   tempGPU -> (L solve) -> xarray -> (U solve) -> tempGPU -> (cperm copy) -> xarray;
   the final permuted copy cannot be done in place, which is why tempGPU is reused.
   All kernels are enqueued on PetscDefaultCudaStream. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers; wrap them in thrust device_ptr for the permutation copies */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: xarray = L^{-1} tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: tempGPU = U^{-1} xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1659 
/* Solve A x = b for a factorization computed with the natural (identity) ordering,
   so no permutation copies are needed: a forward sweep with L into the work vector
   followed by a backward sweep with U into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Raw device pointers of the vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Forward sweep: work = L^{-1} b */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Backward sweep: x = U^{-1} work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1689 #endif
1690 
1691 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of the cuSPARSE ILU(0) factorization.

   The symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0) already copied A's sparsity
   pattern to fs->csrRowPtr32/csrColIdx32, allocated fs->csrVal and all descriptors/buffers,
   and ran cusparseXcsrilu02_analysis().  Here we refresh the values from A, factor in place
   (L and U overwrite fs->csrVal), and (re)prepare the SpSV solve descriptors. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact (device-to-device; fs->csrVal is factored in place below) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsrilu02_zeroPivot() is blocking; it synchronizes to inspect the factorization result */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* With CUDA >= 12.1.1 and an existing analysis, only push the new values into the SpSV descriptors */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  /* The factor now lives only on the GPU; install the SpSV-based solve routines */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1754 
/* MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - symbolic phase of the cuSPARSE ILU(0) factorization.

   ILU(0) keeps A's sparsity pattern, so "symbolic" here means: copy A's i,j to fact, allocate
   the value array and cusparse descriptors/buffers, run cusparseXcsrilu02_analysis(), and
   estimate the flops of the numeric phase.  The ordering IS arguments are ignored (natural
   ordering only).  fact's interface is left pointing at MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0.

   Fix vs. previous revision: the flop estimate computed nzLeft = adiag[i] - Ai[i] (the exact
   number of nonzeros left of the diagonal) and then immediately overwrote it with the guess
   (nzRow - 1) / 2, making the first assignment a dead store and the estimate wrong for
   asymmetric row profiles.  The overwrite is removed; the exact count feeds the formula. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) has zero fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L from ILU has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt       *Ai, nzRow, nzLeft;
    const PetscInt *adiag;
    PetscLogDouble  flops = 0.0;

    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = adiag[i] - Ai[i];
        /* We eliminate the nonzeros left of the diagonal one by one. Eliminating the k-th one updates the
          (nzRow - k + 1) entries from it to the end of the row, each with a multiplication and an addition;
          summing 2*(nzRow - k + 1) over k = 1..nzLeft gives nzLeft*(2*nzRow - nzLeft + 1) flops for this row.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1910 
/* Solve A x = b using an ICC(0) Cholesky factorization A = L L^T held in fs->csrVal:
   a forward sweep with L into fs->Y, then a backward sweep with L^T into x.
   Only the lower-triangular descriptor spMatDescr_L exists; the transpose sweep
   reuses it with CUSPARSE_OPERATION_TRANSPOSE via spsvDescr_Lt. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barr;
  PetscScalar                  *xarr;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarr));
  PetscCall(VecCUDAGetArrayRead(b, &barr));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward sweep L y = b; y is kept in the internal work vector fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barr));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward sweep L^T x = y; the X descriptor is repointed at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barr));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarr));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1941 
/* MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - numeric phase of the cuSPARSE ICC(0) factorization.

   Refreshes fact's values from A, factors in place with cusparseXcsric02 (only the lower
   triangular part of fs->csrVal is read/written), and (re)prepares the SpSV descriptors
   for the L and L^T sweeps used by MatSolve_SeqAIJCUSPARSE_ICC0.

   Fix vs. previous revision: the GPU work was not bracketed by PetscLogGpuTimeBegin/End,
   unlike the sibling MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0, so -log_view under-reported
   GPU time for ICC(0); the timing calls are added (logging only, no behavior change). */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin()); /* mirror MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 so GPU time is logged */
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* Blocking query; synchronizes to inspect the factorization result */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* With CUDA >= 12.1.1 and an existing analysis, only push the new values into the SpSV descriptors */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric and requires valid matrix values, so it runs after cusparseXcsric02() */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  /* The factor now lives only on the GPU; the same routine serves solve and solvetranspose (A = L L^T) */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2003 
2004 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2005 {
2006   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2007   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2008   PetscInt                      m, nz;
2009 
2010   PetscFunctionBegin;
2011   if (PetscDefined(USE_DEBUG)) {
2012     PetscBool flg, diagDense;
2013 
2014     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2015     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2016     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2017     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
2018     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
2019   }
2020 
2021   /* Free the old stale stuff */
2022   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2023 
2024   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2025      but they will not be used. Allocate them just for easy debugging.
2026    */
2027   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2028 
2029   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2030   fact->factortype             = MAT_FACTOR_ICC;
2031   fact->info.factor_mallocs    = 0;
2032   fact->info.fill_ratio_given  = info->fill;
2033   fact->info.fill_ratio_needed = 1.0;
2034 
2035   aij->row = NULL;
2036   aij->col = NULL;
2037 
2038   /* ====================================================================== */
2039   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2040   /* We'll do in-place factorization on fact                                */
2041   /* ====================================================================== */
2042   const int *Ai, *Aj;
2043 
2044   m  = fact->rmap->n;
2045   nz = aij->nz;
2046 
2047   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2048   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2049   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2050   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2051   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2052   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2053 
2054   /* ====================================================================== */
2055   /* Create mat descriptors for M, L                                        */
2056   /* ====================================================================== */
2057   cusparseFillMode_t fillMode;
2058   cusparseDiagType_t diagType;
2059 
2060   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2061   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2062   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2063 
2064   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2065     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2066     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2067     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2068     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2069   */
2070   fillMode = CUSPARSE_FILL_MODE_LOWER;
2071   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2072   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2073   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2074   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2075 
2076   /* ========================================================================= */
2077   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2078   /* ========================================================================= */
2079   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2080   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2081 
2082   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2083   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2084 
2085   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2086   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2087 
2088   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2089   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2090 
2091   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2092   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2093 
2094   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2095      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2096    */
2097   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2098     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2099     fs->spsvBuffer_L = fs->factBuffer_M;
2100     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2101   } else {
2102     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2103     fs->spsvBuffer_Lt = fs->factBuffer_M;
2104     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2105   }
2106 
2107   /* ========================================================================== */
2108   /* Perform analysis of ic0 on M                                               */
2109   /* The lower triangular part of M has the same sparsity pattern as L          */
2110   /* ========================================================================== */
2111   int              structural_zero;
2112   cusparseStatus_t status;
2113 
2114   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2115   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2116   if (PetscDefined(USE_DEBUG)) {
2117     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2118     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2119     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2120   }
2121 
2122   /* Estimate FLOPs of the numeric factorization */
2123   {
2124     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2125     PetscInt      *Ai, nzRow, nzLeft;
2126     PetscLogDouble flops = 0.0;
2127 
2128     Ai = Aseq->i;
2129     for (PetscInt i = 0; i < m; i++) {
2130       nzRow = Ai[i + 1] - Ai[i];
2131       if (nzRow > 1) {
2132         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2133           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2134         */
2135         nzLeft = (nzRow - 1) / 2;
2136         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2137       }
2138     }
2139     fs->numericFactFlops = flops;
2140   }
2141   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2142   PetscFunctionReturn(PETSC_SUCCESS);
2143 }
2144 #endif
2145 
/*
  Numeric LU factorization for SeqAIJCUSPARSE matrices. The factorization itself is performed
  on the host by MatLUFactorNumeric_SeqAIJ(); afterwards the appropriate solve routines
  (device-based unless use_cpu_solve is set) are installed on the factored matrix B and the
  triangular factors are copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* make sure A's values are current on the host before the host factorization */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factors exist on the host only at this point */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      /* no row/column permutations: the cheaper natural-ordering solvers apply */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2184 
/*
  Symbolic LU factorization: delegated to the host SeqAIJ implementation, with the
  CUSPARSE-aware numeric phase installed afterwards.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Drop any stale device-side triangular factors from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  /* Route the numeric phase through the CUSPARSE wrapper */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2195 
/*
  Symbolic ILU factorization. With CUDA >= 11.4, ILU(0) with natural (identity) row/column
  orderings and device-side factoring requested is routed to the specialized cusparse ILU(0)
  path; all other cases fall back to the host SeqAIJ symbolic factorization.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) {
    /* the device ILU(0) path is only taken when there are no row/column permutations */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* host fallback: reset device factors, do host symbolic, install CUSPARSE numeric phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2218 
/*
  Symbolic ICC factorization. With CUDA >= 11.4, ICC(0) with a natural (identity) permutation
  and device-side factoring requested is routed to the specialized cusparse ICC(0) path; all
  other cases fall back to the host SeqAIJ symbolic factorization.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); /* device ICC(0) requires no permutation */
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* host fallback: reset device factors, do host symbolic, install CUSPARSE numeric phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2238 
/*
  Symbolic Cholesky factorization: delegated to the host SeqAIJ implementation, with the
  CUSPARSE-aware numeric phase installed afterwards.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Throw away any device-side factors left over from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  /* Route the numeric phase through the CUSPARSE wrapper */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2249 
/* Report MATSOLVERCUSPARSE ("cusparse") as the solver package behind this factored matrix */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2256 
2257 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2264 
2265   Level: beginner
2266 
2267 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2268           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2269 M*/
2270 
/*
  Factory routine: create the (empty) factored matrix B of type MATSEQAIJCUSPARSE for A and
  install the symbolic factorization routines appropriate for the requested factor type.
  GPU-capable symbolic routines are installed unless A is bound to the CPU.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  m = A->rmap->n;
  PetscBool ongpu;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, m, m, m, m));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  ongpu = (PetscBool)!A->boundtocpu; /* choose device vs host symbolic routines */
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    (*B)->ops->ilufactorsymbolic = ongpu ? MatILUFactorSymbolic_SeqAIJCUSPARSE : MatILUFactorSymbolic_SeqAIJ;
    (*B)->ops->lufactorsymbolic  = ongpu ? MatLUFactorSymbolic_SeqAIJCUSPARSE : MatLUFactorSymbolic_SeqAIJ;
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = ongpu ? MatICCFactorSymbolic_SeqAIJCUSPARSE : MatICCFactorSymbolic_SeqAIJ;
    (*B)->ops->choleskyfactorsymbolic = ongpu ? MatCholeskyFactorSymbolic_SeqAIJCUSPARSE : MatCholeskyFactorSymbolic_SeqAIJ;
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2311 
/*
  Copy the numeric values of A from the device to the host when the device copy is the
  authoritative one (offloadmask == PETSC_OFFLOAD_GPU). Handles unfactored matrices (values in
  the CsrMatrix held by Mat_SeqAIJCUSPARSE) and, for CUDA >= 11.4, device-factored matrices
  (values in Mat_SeqAIJCUSPARSETriFactors::csrVal). Note cusp and fs both alias A->spptr;
  which interpretation is valid depends on A->factortype.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2341 
/* Return the host-side value array of A, syncing from the device first if needed */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Bring the host copy up to date before exposing it for read/write access */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2349 
/* Release the host-side value array obtained with MatSeqAIJGetArray_SeqAIJCUSPARSE() */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host values may have been modified through the returned pointer: mark the device copy stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2357 
/* Return the host-side value array of A for read-only access, syncing from the device first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Bring the host copy up to date before handing out a read-only pointer */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2365 
/* Release a read-only host array; read access cannot change the values, so the offload mask is untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2372 
/* Return the host-side value array of A for write-only access (no device-to-host sync needed) */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Caller intends to overwrite all values, so the current device copy is irrelevant */
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2379 
/* Release the write-only host array obtained with MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE() */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host values were (re)written: the device copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2387 
/*
  Return device pointers to A's CSR data: row offsets (i), column indices (j), and values (a),
  together with the memory type (always PETSC_MEMTYPE_CUDA). Each output argument may be NULL
  if not wanted. Not available with 64-bit PetscInt, since the device matrix stores 32-bit
  indices (THRUSTINTARRAY32). Not usable on factored matrices.

  Fix: the two error messages read "does not supported"; corrected to "does not support".
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2418 
/*
  Copy/build the matrix on the device. If the nonzero pattern is unchanged and the storage
  format is CSR, only the values are refreshed; otherwise the existing cuSPARSE structures are
  destroyed and rebuilt from the host CSR (or compressed-row) data.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* cleared when host values are absent, so offloadmask is not set to BOTH below */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed: invalidate (values of) any cached transpose; pattern kept (PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* Nonzero pattern changed (or non-CSR storage requested): destroy and rebuild everything */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only rows with nonzeros are stored; ridx maps compressed row -> global row index */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* pattern-only matrix: take nnz from the row offsets and do not mark values as synced */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, required since the handle uses POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11: build a temporary CSR on device, convert it to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2567 
/* Thrust functor: accumulate tuple element 0 into tuple element 1, i.e. t<1> += t<0> */
struct VecCUDAPlusEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2575 
/* Thrust functor: copy tuple element 0 into tuple element 1, i.e. t<1> = t<0> */
struct VecCUDAEquals {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2583 
/* Thrust functor: copy tuple element 1 into tuple element 0, i.e. t<0> = t<1> (reversed roles) */
struct VecCUDAEqualsReverse {
  template <typename T>
  __host__ __device__ void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2591 
/* Context stored in C->product->data for cuSPARSE-based mat-mat products (see the Numeric/Symbolic routines below) */
struct MatProductCtx_MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably TRUE when the product C is dense — confirm against the symbolic phase */
  PetscScalar   *Bt;       /* device buffer (freed with cudaFree in the ctx destructor); presumably an explicit transpose of B — verify */
  Mat            X;        /* intermediate dense result used by the PtAP/RARt paths */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count associated with this product (for logging) */
  CsrMatrix     *Bcsr;     /* device CSR data for B; presumably used when B must be in sparse form — verify */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;  /* sparse descriptor for B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;    /* dense descriptor for C */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2616 
/*
  Destroy a MatProductCtx_MatMatCusparse context: release all device buffers and cuSPARSE
  descriptors it owns, then free the context struct itself and NULL out *data.
*/
static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(void **data)
{
  MatProductCtx_MatMatCusparse *ctx = *(MatProductCtx_MatMatCusparse **)data;

  PetscFunctionBegin;
  /* cudaFree() and delete are both no-ops on NULL, so Bt/Bcsr need no guards */
  PetscCallCUDA(cudaFree(ctx->Bt));
  delete ctx->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (ctx->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ctx->matSpBDescr));
  if (ctx->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matBDescr));
  if (ctx->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matCDescr));
  if (ctx->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(ctx->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (ctx->dBuffer4) PetscCallCUDA(cudaFree(ctx->dBuffer4));
  if (ctx->dBuffer5) PetscCallCUDA(cudaFree(ctx->dBuffer5));
  #endif
  if (ctx->mmBuffer) PetscCallCUDA(cudaFree(ctx->mmBuffer));
  if (ctx->mmBuffer2) PetscCallCUDA(cudaFree(ctx->mmBuffer2));
#endif
  PetscCall(MatDestroy(&ctx->X));
  PetscCall(PetscFree(*data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2640 
2641 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2642 
2643 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2644 {
2645   Mat_Product                  *product = C->product;
2646   Mat                           A, B;
2647   PetscInt                      m, n, blda, clda;
2648   PetscBool                     flg, biscuda;
2649   Mat_SeqAIJCUSPARSE           *cusp;
2650   cusparseStatus_t              stat;
2651   cusparseOperation_t           opA;
2652   const PetscScalar            *barray;
2653   PetscScalar                  *carray;
2654   MatProductCtx_MatMatCusparse *mmdata;
2655   Mat_SeqAIJCUSPARSEMultStruct *mat;
2656   CsrMatrix                    *csrmat;
2657 
2658   PetscFunctionBegin;
2659   MatCheckProduct(C, 1);
2660   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2661   mmdata = (MatProductCtx_MatMatCusparse *)product->data;
2662   A      = product->A;
2663   B      = product->B;
2664   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2665   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2666   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2667      Instead of silently accepting the wrong answer, I prefer to raise the error */
2668   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2669   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2670   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2671   switch (product->type) {
2672   case MATPRODUCT_AB:
2673   case MATPRODUCT_PtAP:
2674     mat = cusp->mat;
2675     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2676     m   = A->rmap->n;
2677     n   = B->cmap->n;
2678     break;
2679   case MATPRODUCT_AtB:
2680     if (!A->form_explicit_transpose) {
2681       mat = cusp->mat;
2682       opA = CUSPARSE_OPERATION_TRANSPOSE;
2683     } else {
2684       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2685       mat = cusp->matTranspose;
2686       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2687     }
2688     m = A->cmap->n;
2689     n = B->cmap->n;
2690     break;
2691   case MATPRODUCT_ABt:
2692   case MATPRODUCT_RARt:
2693     mat = cusp->mat;
2694     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2695     m   = A->rmap->n;
2696     n   = B->rmap->n;
2697     break;
2698   default:
2699     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2700   }
2701   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2702   csrmat = (CsrMatrix *)mat->mat;
2703   /* if the user passed a CPU matrix, copy the data to the GPU */
2704   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2705   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2706   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2707 
2708   PetscCall(MatDenseGetLDA(B, &blda));
2709   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2710     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2711     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2712   } else {
2713     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2714     PetscCall(MatDenseGetLDA(C, &clda));
2715   }
2716 
2717   PetscCall(PetscLogGpuTimeBegin());
2718 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2719   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2720   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2721   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2722   #else
2723   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2724   #endif
2725 
2726   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2727   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2728     size_t mmBufferSize;
2729     if (mmdata->initialized && mmdata->Blda != blda) {
2730       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2731       mmdata->matBDescr = NULL;
2732     }
2733     if (!mmdata->matBDescr) {
2734       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2735       mmdata->Blda = blda;
2736     }
2737 
2738     if (mmdata->initialized && mmdata->Clda != clda) {
2739       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2740       mmdata->matCDescr = NULL;
2741     }
2742     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2743       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2744       mmdata->Clda = clda;
2745     }
2746 
2747   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2748     if (matADescr) {
2749       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2750       matADescr = NULL;
2751     }
2752   #endif
2753 
2754     if (!matADescr) {
2755       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2756                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2757       PetscCallCUSPARSE(stat);
2758     }
2759 
2760     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2761 
2762     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2763       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2764       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2765       mmdata->mmBufferSize = mmBufferSize;
2766     }
2767 
2768   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2769     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2770   #endif
2771 
2772     mmdata->initialized = PETSC_TRUE;
2773   } else {
2774     /* to be safe, always update pointers of the mats */
2775     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2776     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2777     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2778   }
2779 
2780   /* do cusparseSpMM, which supports transpose on B */
2781   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2782 #else
2783   PetscInt k;
2784   /* cusparseXcsrmm does not support transpose on B */
2785   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2786     cublasHandle_t cublasv2handle;
2787     cublasStatus_t cerr;
2788 
2789     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2790     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2791     PetscCallCUBLAS(cerr);
2792     blda = B->cmap->n;
2793     k    = B->cmap->n;
2794   } else {
2795     k = B->rmap->n;
2796   }
2797 
2798   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2799   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2800   PetscCallCUSPARSE(stat);
2801 #endif
2802   PetscCall(PetscLogGpuTimeEnd());
2803   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2804   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2805   if (product->type == MATPRODUCT_RARt) {
2806     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2807     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2808   } else if (product->type == MATPRODUCT_PtAP) {
2809     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2810     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2811   } else {
2812     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2813   }
2814   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2815   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2816   PetscFunctionReturn(PETSC_SUCCESS);
2817 }
2818 
/*
  Symbolic phase for C = op(A)*op(B) with A of type MATSEQAIJCUSPARSE and B dense.

  Sets the sizes/block sizes of C according to the product type, forces C to the
  MATSEQDENSECUDA type (remembering whether the caller asked for a host MATSEQDENSE
  result so the numeric phase can convert back), and allocates the product context
  consumed by MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA().
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      nr, nc;
  PetscBool                     Cishostdense, isaijcusp;
  MatProductCtx_MatMatCusparse *ctx;
  Mat_SeqAIJCUSPARSE           *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isaijcusp));
  PetscCheck(isaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions and block-size propagation, by product type */
  if (product->type == MATPRODUCT_AB) {
    nr = A->rmap->n;
    nc = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
  } else if (product->type == MATPRODUCT_AtB) {
    nr = A->cmap->n;
    nc = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
  } else if (product->type == MATPRODUCT_ABt) {
    nr = A->rmap->n;
    nc = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
  } else if (product->type == MATPRODUCT_PtAP) {
    nr = B->cmap->n;
    nc = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
  } else if (product->type == MATPRODUCT_RARt) {
    nr = B->rmap->n;
    nc = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, nr, nc, nr, nc));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &Cishostdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&ctx));
  ctx->cisdense = Cishostdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&ctx->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* RARt and PtAP go through an intermediate dense matrix X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    const PetscInt xcols = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &ctx->X));
    PetscCall(MatSetType(ctx->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    PetscCall(MatSetSizes(ctx->X, A->rmap->n, xcols, A->rmap->n, xcols));
  }
  C->product->data    = ctx;
  C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2898 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all of type
  MATSEQAIJCUSPARSE.  The nonzero pattern of C and the cuSPARSE SpGEMM descriptors
  were built during the symbolic phase; this routine only (re)computes the numerical
  values on the GPU, then performs a shortened MatAssemblyEnd on C.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatProductCtx_MatMatCusparse *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty result: skip the GPU computation entirely, only do the assembly bookkeeping */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  /* CopyToGpu does not copy when a matrix is bound to the CPU, which would silently give wrong results */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* When a factor is symmetric, the symbolic phase computed the cheaper non-transposed
     product; mirror that remapping here so we pick the same mult structs */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* Pick the GPU copies of A and B; explicit transposes (matTranspose) were formed in
     the symbolic phase since cuSPARSE SpGEMM only supports non-transposed operands */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero live in device memory (see symbolic phase), hence device pointer mode */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* SpGEMMreuse: the pattern of C is fixed, recompute values only */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* legacy SpGEMM: compute into the work buffer allocated at symbolic time, then copy into C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 csrgemm path: operates directly on the raw CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* values now live only on the GPU */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3019 
3020 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3021 {
3022   Mat_Product                  *product = C->product;
3023   Mat                           A, B;
3024   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3025   Mat_SeqAIJ                   *a, *b, *c;
3026   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3027   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3028   PetscInt                      i, j, m, n, k;
3029   PetscBool                     flg;
3030   cusparseStatus_t              stat;
3031   MatProductType                ptype;
3032   MatProductCtx_MatMatCusparse *mmdata;
3033   PetscLogDouble                flops;
3034   PetscBool                     biscompressed, ciscompressed;
3035 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3036   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3037   cusparseSpMatDescr_t BmatSpDescr;
3038 #else
3039   int cnz;
3040 #endif
3041   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3042 
3043   PetscFunctionBegin;
3044   MatCheckProduct(C, 1);
3045   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3046   A = product->A;
3047   B = product->B;
3048   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3049   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3050   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3051   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3052   a = (Mat_SeqAIJ *)A->data;
3053   b = (Mat_SeqAIJ *)B->data;
3054   /* product data */
3055   PetscCall(PetscNew(&mmdata));
3056   C->product->data    = mmdata;
3057   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3058 
3059   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3060   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3061   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3062   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3063   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3064   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3065 
3066   ptype = product->type;
3067   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3068     ptype                                          = MATPRODUCT_AB;
3069     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3070   }
3071   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3072     ptype                                          = MATPRODUCT_AB;
3073     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3074   }
3075   biscompressed = PETSC_FALSE;
3076   ciscompressed = PETSC_FALSE;
3077   switch (ptype) {
3078   case MATPRODUCT_AB:
3079     m    = A->rmap->n;
3080     n    = B->cmap->n;
3081     k    = A->cmap->n;
3082     Amat = Acusp->mat;
3083     Bmat = Bcusp->mat;
3084     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3085     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3086     break;
3087   case MATPRODUCT_AtB:
3088     m = A->cmap->n;
3089     n = B->cmap->n;
3090     k = A->rmap->n;
3091     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3092     Amat = Acusp->matTranspose;
3093     Bmat = Bcusp->mat;
3094     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3095     break;
3096   case MATPRODUCT_ABt:
3097     m = A->rmap->n;
3098     n = B->rmap->n;
3099     k = A->cmap->n;
3100     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3101     Amat = Acusp->mat;
3102     Bmat = Bcusp->matTranspose;
3103     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3104     break;
3105   default:
3106     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3107   }
3108 
3109   /* create cusparse matrix */
3110   PetscCall(MatSetSizes(C, m, n, m, n));
3111   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3112   c     = (Mat_SeqAIJ *)C->data;
3113   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3114   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3115   Ccsr  = new CsrMatrix;
3116 
3117   c->compressedrow.use = ciscompressed;
3118   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3119     c->compressedrow.nrows = a->compressedrow.nrows;
3120     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3121     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3122     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3123     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3124     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3125   } else {
3126     c->compressedrow.nrows  = 0;
3127     c->compressedrow.i      = NULL;
3128     c->compressedrow.rindex = NULL;
3129     Ccusp->workVector       = NULL;
3130     Cmat->cprowIndices      = NULL;
3131   }
3132   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3133   Ccusp->mat        = Cmat;
3134   Ccusp->mat->mat   = Ccsr;
3135   Ccsr->num_rows    = Ccusp->nrows;
3136   Ccsr->num_cols    = n;
3137   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3138   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3139   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3140   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3141   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3142   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3143   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3144   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3145   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3146   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3147   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3148     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3149     c->nz                = 0;
3150     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3151     Ccsr->values         = new THRUSTARRAY(c->nz);
3152     goto finalizesym;
3153   }
3154 
3155   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3156   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3157   Acsr = (CsrMatrix *)Amat->mat;
3158   if (!biscompressed) {
3159     Bcsr = (CsrMatrix *)Bmat->mat;
3160 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3161     BmatSpDescr = Bmat->matDescr;
3162 #endif
3163   } else { /* we need to use row offsets for the full matrix */
3164     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3165     Bcsr                 = new CsrMatrix;
3166     Bcsr->num_rows       = B->rmap->n;
3167     Bcsr->num_cols       = cBcsr->num_cols;
3168     Bcsr->num_entries    = cBcsr->num_entries;
3169     Bcsr->column_indices = cBcsr->column_indices;
3170     Bcsr->values         = cBcsr->values;
3171     if (!Bcusp->rowoffsets_gpu) {
3172       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3173       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3174       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3175     }
3176     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3177     mmdata->Bcsr      = Bcsr;
3178 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3179     if (Bcsr->num_rows && Bcsr->num_cols) {
3180       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3181       PetscCallCUSPARSE(stat);
3182     }
3183     BmatSpDescr = mmdata->matSpBDescr;
3184 #endif
3185   }
3186   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3187   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3188   /* precompute flops count */
3189   if (ptype == MATPRODUCT_AB) {
3190     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3191       const PetscInt st = a->i[i];
3192       const PetscInt en = a->i[i + 1];
3193       for (j = st; j < en; j++) {
3194         const PetscInt brow = a->j[j];
3195         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3196       }
3197     }
3198   } else if (ptype == MATPRODUCT_AtB) {
3199     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3200       const PetscInt anzi = a->i[i + 1] - a->i[i];
3201       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3202       flops += (2. * anzi) * bnzi;
3203     }
3204   } else { /* TODO */
3205     flops = 0.;
3206   }
3207 
3208   mmdata->flops = flops;
3209   PetscCall(PetscLogGpuTimeBegin());
3210 
3211 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3212   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3213   // cuda-12.2 requires non-null csrRowOffsets
3214   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3215   PetscCallCUSPARSE(stat);
3216   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3217   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3218   {
3219     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3220      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3221   */
3222     void *dBuffer1 = NULL;
3223     void *dBuffer2 = NULL;
3224     void *dBuffer3 = NULL;
3225     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3226     size_t bufferSize1 = 0;
3227     size_t bufferSize2 = 0;
3228     size_t bufferSize3 = 0;
3229     size_t bufferSize4 = 0;
3230     size_t bufferSize5 = 0;
3231 
3232     /* ask bufferSize1 bytes for external memory */
3233     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3234     PetscCallCUSPARSE(stat);
3235     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3236     /* inspect the matrices A and B to understand the memory requirement for the next step */
3237     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3238     PetscCallCUSPARSE(stat);
3239 
3240     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3241     PetscCallCUSPARSE(stat);
3242     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3243     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3244     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3245     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3246     PetscCallCUSPARSE(stat);
3247     PetscCallCUDA(cudaFree(dBuffer1));
3248     PetscCallCUDA(cudaFree(dBuffer2));
3249 
3250     /* get matrix C non-zero entries C_nnz1 */
3251     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3252     c->nz = (PetscInt)C_nnz1;
3253     /* allocate matrix C */
3254     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3255     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3256     Ccsr->values = new THRUSTARRAY(c->nz);
3257     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3258     /* update matC with the new pointers */
3259     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3260     PetscCallCUSPARSE(stat);
3261 
3262     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3263     PetscCallCUSPARSE(stat);
3264     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3265     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3266     PetscCallCUSPARSE(stat);
3267     PetscCallCUDA(cudaFree(dBuffer3));
3268     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3269     PetscCallCUSPARSE(stat);
3270     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3271   }
3272   #else
3273   size_t bufSize2;
3274   /* ask bufferSize bytes for external memory */
3275   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3276   PetscCallCUSPARSE(stat);
3277   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3278   /* inspect the matrices A and B to understand the memory requirement for the next step */
3279   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3280   PetscCallCUSPARSE(stat);
3281   /* ask bufferSize again bytes for external memory */
3282   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3283   PetscCallCUSPARSE(stat);
3284   /* The CUSPARSE documentation is not clear, nor the API
3285      We need both buffers to perform the operations properly!
3286      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3287      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3288      is stored in the descriptor! What a messy API... */
3289   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3290   /* compute the intermediate product of A * B */
3291   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3292   PetscCallCUSPARSE(stat);
3293   /* get matrix C non-zero entries C_nnz1 */
3294   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3295   c->nz = (PetscInt)C_nnz1;
3296   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3297                       mmdata->mmBufferSize / 1024));
3298   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3299   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3300   Ccsr->values = new THRUSTARRAY(c->nz);
3301   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3302   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3303   PetscCallCUSPARSE(stat);
3304   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3305   PetscCallCUSPARSE(stat);
3306   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3307 #else
3308   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3309   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3310                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3311   PetscCallCUSPARSE(stat);
3312   c->nz                = cnz;
3313   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3314   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3315   Ccsr->values = new THRUSTARRAY(c->nz);
3316   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3317 
3318   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3319   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3320      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3321      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3322   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3323                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3324   PetscCallCUSPARSE(stat);
3325 #endif
3326   PetscCall(PetscLogGpuFlops(mmdata->flops));
3327   PetscCall(PetscLogGpuTimeEnd());
3328 finalizesym:
3329   c->free_a = PETSC_TRUE;
3330   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3331   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3332   c->free_ij = PETSC_TRUE;
3333   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3334     PetscInt      *d_i = c->i;
3335     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3336     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3337     ii = *Ccsr->row_offsets;
3338     jj = *Ccsr->column_indices;
3339     if (ciscompressed) d_i = c->compressedrow.i;
3340     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3341     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3342   } else {
3343     PetscInt *d_i = c->i;
3344     if (ciscompressed) d_i = c->compressedrow.i;
3345     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3347   }
3348   if (ciscompressed) { /* need to expand host row offsets */
3349     PetscInt r = 0;
3350     c->i[0]    = 0;
3351     for (k = 0; k < c->compressedrow.nrows; k++) {
3352       const PetscInt next = c->compressedrow.rindex[k];
3353       const PetscInt old  = c->compressedrow.i[k];
3354       for (; r < next; r++) c->i[r + 1] = old;
3355     }
3356     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3357   }
3358   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3359   PetscCall(PetscMalloc1(m, &c->ilen));
3360   PetscCall(PetscMalloc1(m, &c->imax));
3361   c->maxnz         = c->nz;
3362   c->nonzerorowcnt = 0;
3363   c->rmax          = 0;
3364   for (k = 0; k < m; k++) {
3365     const PetscInt nn = c->i[k + 1] - c->i[k];
3366     c->ilen[k] = c->imax[k] = nn;
3367     c->nonzerorowcnt += (PetscInt)!!nn;
3368     c->rmax = PetscMax(c->rmax, nn);
3369   }
3370   PetscCall(PetscMalloc1(c->nz, &c->a));
3371   Ccsr->num_entries = c->nz;
3372 
3373   C->nonzerostate++;
3374   PetscCall(PetscLayoutSetUp(C->rmap));
3375   PetscCall(PetscLayoutSetUp(C->cmap));
3376   Ccusp->nonzerostate = C->nonzerostate;
3377   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3378   C->preallocated     = PETSC_TRUE;
3379   C->assembled        = PETSC_FALSE;
3380   C->was_assembled    = PETSC_FALSE;
3381   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3382     mmdata->reusesym = PETSC_TRUE;
3383     C->offloadmask   = PETSC_OFFLOAD_GPU;
3384   }
3385   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3386   PetscFunctionReturn(PETSC_SUCCESS);
3387 }
3388 
3389 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3390 
/* handles sparse or dense B */
/* Selects the productsymbolic implementation for mat = op(A,B[,C]) based on the types of the
   product operands: CUDA sparse-sparse kernels, CUDA sparse-dense kernels, or the CPU AIJ
   fallback. Also honors per-product command-line switches that force the CPU backend. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  /* Eligibility: B dense selects the sparse-dense path; for the sparse-sparse path B (and, for
     ABC, also C) must be MATSEQAIJCUSPARSE and not bound to the CPU */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* Each product type exposes an option to force the CPU implementation. The option name
       depends on whether the user entered through the old API (MatMatMult() etc., api_user set)
       or the generic MatProduct API. */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user requested CPU: disqualify the GPU sparse path */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no dedicated GPU kernels for these: build them from the pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3508 
/* yy = A*xx; thin wrapper over the shared mult/add kernel (no addend, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3515 
/* zz = A*xx + yy; thin wrapper over the shared mult/add kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3522 
/* yy = A^H*xx; thin wrapper over the shared mult/add kernel (trans = true, herm = true) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3529 
/* zz = A^H*xx + yy; thin wrapper over the shared mult/add kernel (trans = true, herm = true) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3536 
/* yy = A^T*xx; thin wrapper over the shared mult/add kernel (trans = true, herm = false) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3543 
/* y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.
   Launch with a 1-D grid of at least n threads; threads past the tail exit via the bounds check.
   NOTE(review): no atomics are used, so callers must pass distinct indices in idx (the callers in
   this file pass compressed-row indices, one per nonzero row) — duplicate indices would race.
   The global index is computed in PetscInt: with the previous `int` arithmetic,
   blockIdx.x * blockDim.x could overflow for very large n when PetscInt is 64-bit. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3549 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose.
   yy may be NULL (no addend) or equal to zz (in-place add). Handles compressed-row storage,
   where the GPU matrix drops zero rows and a work vector bridges the short and full-length
   vectors. Caches cuSPARSE dense-vector descriptors and the SpMV buffer on first use. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* Empty matrix: op(A) x is zero, so the result is just the addend (or zero) */
  if (!a->nz) {
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* Choose the matrix structure and cuSPARSE operation: for the transpose case we prefer an
     explicitly formed A^T (when allowed and not Hermitian); otherwise cuSPARSE does op(A) itself */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  /* thrust/cuSPARSE calls below may throw; convert C++ exceptions into a PETSc error */
  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* workVector[k] = x[cprowIndices[k]]: gather the entries of x for the kept (nonzero) rows */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      /* lazily create the per-opA sparse-matrix descriptor on first use */
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11: legacy csrmv interface */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop counts: one multiply and one add per nonzero; without an addend the first write
     to each nonzero row is an assignment, hence the nonzerorowcnt correction */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3723 
/* zz = A^T*xx + yy; thin wrapper over the shared mult/add kernel (trans = true, herm = false) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3730 
3731 PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
3732 
/* Extract the diagonal of a CSR matrix: diag[r] = value of the entry with column index r in
   row r, or 0 when that row holds no diagonal entry. One thread per row (len rows total);
   threads beyond the last row do nothing. row/col/val are the CSR arrays on the device. */
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r < len) {
    const PetscInt begin = row[r]; /* first entry of row r */
    const PetscInt end   = row[r + 1];
    PetscScalar    dval  = 0.0;

    /* linear scan of the row; stop at the first (only) diagonal entry */
    for (PetscInt j = begin; j < end; j++) {
      if (col[j] == r) {
        dval = val[j];
        break;
      }
    }
    diag[r] = dval;
  }
}
3750 
/* Fill diag with the diagonal of A. If the up-to-date values live on the GPU, run the
   GetDiagonal_CSR kernel; otherwise defer to the host implementation. */
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE           *cusp    = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat;
  PetscScalar                  *darr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_BOTH && A->offloadmask != PETSC_OFFLOAD_GPU) {
    /* current values are only on the host: use the CPU path */
    PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
  {
    PetscInt   n   = A->rmap->n;
    CsrMatrix *csr = (CsrMatrix *)mstruct->mat;

    if (n > 0) {
      PetscCall(VecCUDAGetArrayWrite(diag, &darr));
      /* one thread per row; 256-thread blocks */
      GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), n, darr);
      PetscCallCUDA(cudaPeekAtLastError());
      PetscCall(VecCUDARestoreArrayWrite(diag, &darr));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3772 
/* Finish assembly on the host side; the GPU copy is refreshed lazily on first use
   (see MatSeqAIJCUSPARSECopyToGPU calls in the mult kernels) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3779 
3780 /*@
3781   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
3782 
3783   Collective
3784 
3785   Input Parameters:
3786 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3787 . m    - number of rows
3788 . n    - number of columns
3789 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
3790 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3791 
3792   Output Parameter:
3793 . A - the matrix
3794 
3795   Level: intermediate
3796 
3797   Notes:
3798   This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
3799   calculations. For good matrix assembly performance the user should preallocate the matrix
3800   storage by setting the parameter `nz` (or the array `nnz`).
3801 
3802   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3803   MatXXXXSetPreallocation() paradgm instead of this routine directly.
3804   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3805 
3806   The AIJ format, also called
3807   compressed row storage, is fully compatible with standard Fortran
3808   storage.  That is, the stored row and column indices can begin at
3809   either one (as in Fortran) or zero.
3810 
3811   Specify the preallocated storage with either nz or nnz (not both).
3812   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3813   allocation.
3814 
3815   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
3816 
3817 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
3818           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3819 @*/
3820 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3821 {
3822   PetscFunctionBegin;
3823   PetscCall(MatCreate(comm, A));
3824   PetscCall(MatSetSizes(*A, m, n, m, n));
3825   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3826   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3827   PetscFunctionReturn(PETSC_SUCCESS);
3828 }
3829 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* A->spptr holds a Mat_SeqAIJCUSPARSE for an unfactored matrix, but a
     Mat_SeqAIJCUSPARSETriFactors for a factored one; destroy accordingly */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* remove all methods composed in MatConvert_SeqAIJ_SeqAIJCUSPARSE/MatBindToCPU before
     delegating the remaining (CPU-side) cleanup to the base SeqAIJ destroy */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3851 
3852 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3853 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* duplicate on the host with the base SeqAIJ routine, then convert the copy
     in place so it becomes a MATSEQAIJCUSPARSE like the source */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3861 
/* Y += a*X. Three paths: cuBLAS axpy on the value arrays when the nonzero patterns are
   identical, cuSPARSE csrgeam when X's pattern is a subset of Y's, and the host SeqAIJ
   fallback otherwise (or when the two matrices are not both bound to the GPU). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share this implementation (e.g. one is bound to the CPU):
       invalidate Y's cached transpose and use the host path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count, and identical row offsets
     and column indices on the device imply identical nonzero patterns */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via csrgeam, writing the result back into Y's CSR arrays in place */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* scalars a and b are passed by host pointer, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit work buffer sized by a separate query call */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the update is just ay[] += a*ax[] on the value arrays */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3945 
/* Y *= a: scale the nonzero values in place on the GPU with a single BLAS-1 scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t handle;
  PetscScalar   *vals;
  PetscBLASInt   n = 1, incr = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* nz must fit in a BLAS integer */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, incr));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3964 
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   gpu = PETSC_FALSE;
  Mat_SeqAIJ *a   = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* zero the device copy (and the cached transpose) when it exists, otherwise fall back
     to zeroing the host arrays; the offload mask records which copy is now current */
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        gpu = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached transpose consistent instead of invalidating it */
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
  else {
    /* no device values were touched (factored matrix or unallocated GPU data): zero on host */
    PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3992 
/* Report the memory type of the matrix data; for this type it is always CUDA device memory */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3999 
/* Switch the matrix operation tables between the CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations. flg == PETSC_TRUE binds to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; their ops are managed elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host copy is up to date before routing operations to the CPU */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* clear the SeqAIJ-level ops (array accessors, etc.) set on the GPU path below */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* install the GPU implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4067 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE, allocating the GPU-side context and
   installing the CUSPARSE operation tables. Supports initial, reuse, and in-place modes. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  /* vectors created from this matrix should default to CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    /* allocate the GPU context: a Mat_SeqAIJCUSPARSE for plain matrices, a
       Mat_SeqAIJCUSPARSETriFactors for factored ones; each owns its cusparse handle */
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* install the remaining GPU ops and composed methods, then rename the type */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4128 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then
   convert it in place to the CUSPARSE type */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4136 
4137 /*MC
4138    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4139 
4140    Options Database Keys:
4141 +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4142 .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4143                                            Other options include ell (ellpack) or hyb (hybrid).
4144 .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4145 -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4146 
4147   Level: beginner
4148 
4149   Notes:
4150   These matrices can be in either CSR, ELL, or HYB format.
4151 
4152   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4153 
4154   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4155   if some integer values passed in do not fit in `int`.
4156 
4157 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4158 M*/
4159 
/* Register the cusparse solver type for all four factorization kinds supported
   by the seqaijcusparse factor routines */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4169 
/* Free the GPU context of an unfactored SeqAIJCUSPARSE matrix: both mult structures,
   all cached thrust arrays, and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* deleting a NULL pointer is a no-op, so these are safe unconditionally */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4187 
/* Free a CsrMatrix (values, column indices, row offsets) and NULL out the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4200 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor structure (legacy csrsv path, only built for CUDA < 11.4) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4219 
/* Free a mult structure: the stored matrix (CSR, or HYB on pre-11.0 CUDA), its descriptor,
   the device-resident scalar constants, and (CUDA >= 11) all cached SpMV/SpMM descriptors
   and buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-allocated scalars used with
       CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached SpMV setup per operation variant (see the cuSpMV array) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4266 
/* Release all factorization data held in a TriFactors structure without destroying the
   structure (or its cusparse handle) itself, so it can be refilled by a new factorization.
   The CUDA < 11.4 branch frees the legacy csrsv-based factors; the >= 11.4 branch frees
   the SpSV-based arrays, descriptors, and buffers. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4320 
/* Fully destroy a TriFactors structure: reset its contents, then release the
   cusparse handle and the structure itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy(fs->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4331 
/* Strict-weak ordering on (i,j) pairs: lexicographic by row, then by column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4340 
/* Mark the cached device transpose as stale; when destroy is true, also free the
   transpose structure and the cached csr2csc index mapping */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); /* no GPU context allocated yet */
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4356 
/* Container destructor for the device-side COO struct: its perm/jmap arrays were
   allocated with cudaMalloc in MatSetPreallocationCOO_SeqAIJCUSPARSE */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4367 
/* COO preallocation: build the host-side COO mapping with the SeqAIJ routine (staging the
   indices to host if the caller passed device pointers), then mirror the jmap/perm arrays
   to the device for use by MatSetValuesCOO_SeqAIJCUSPARSE */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host-side preallocation routine needs the indices on the host */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4408 
/* Kernel: accumulate user-provided COO values kv[] into the CSR value array a[].
   Launched 1-D with a grid-stride loop, so any positive grid size is correct.
   For CSR entry i, jmap[i]..jmap[i+1] delimits the indices into perm[], and perm[k]
   selects the COO entries in kv[] that fold into that slot. With INSERT_VALUES the
   previous a[i] is discarded; otherwise the sum is added to it. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4419 
/* Insert/add COO values into the matrix on the device using the mapping built by
   MatSetPreallocationCOO_SeqAIJCUSPARSE. Host-resident v[] is staged to the device. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  /* retrieve the device-side COO mapping attached during preallocation */
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* Divide in 64-bit PetscCount arithmetic before narrowing to int: the previous
       ((int)(Annz + 255) / 256) cast Annz + 255 to int first, which overflows once the
       matrix has about 2^31 or more nonzeros. Also clamp to the CUDA gridDim.x limit;
       the grid-stride loop in MatAddCOOValues processes any remaining entries. */
    PetscCount nblocks = (Annz + 255) / 256;
    if (nblocks > 2147483647) nblocks = 2147483647; /* max gridDim.x */
    MatAddCOOValues<<<(int)nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  /* free the temporary device copy of v[], if one was made */
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4459 
4460 /*@C
4461   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4462 
4463   Not Collective
4464 
4465   Input Parameters:
4466 + A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4468 
4469   Output Parameters:
4470 + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4471 - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4472 
4473   Level: developer
4474 
4475   Note:
4476   When compressed is true, the CSR structure does not contain empty rows
4477 
4478 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4479 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* nothing requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (!compressed && aij->compressedrow.use) {
    /* device stores compressed rows but the caller wants full row offsets:
       lazily mirror the host row offsets a->i on the device, once */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(aij->i, aij->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else {
    *i = csr->row_offsets->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4507 
4508 /*@C
4509   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4510 
4511   Not Collective
4512 
4513   Input Parameters:
4514 + A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4516 . i          - the CSR row pointers
4517 - j          - the CSR column indices
4518 
4519   Level: developer
4520 
4521 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4522 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: there is nothing format-dependent to undo */
  /* drop the borrowed device pointers; ownership stays with the matrix */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4533 
4534 /*@C
4535   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored
4536 
4537   Not Collective
4538 
4539   Input Parameter:
4540 . A - a `MATSEQAIJCUSPARSE` matrix
4541 
4542   Output Parameter:
4543 . a - pointer to the device data
4544 
4545   Level: developer
4546 
4547   Note:
4548   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4549 
4550 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4551 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* ensure the device copy is current; may trigger a host-to-device transfer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(mat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* hand out the raw device pointer to the CSR values (read-only contract) */
  *a = mat->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4569 
4570 /*@C
4571   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4572 
4573   Not Collective
4574 
4575   Input Parameters:
4576 + A - a `MATSEQAIJCUSPARSE` matrix
4577 - a - pointer to the device data
4578 
4579   Level: developer
4580 
4581 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4582 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no object-state bump, just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4592 
4593 /*@C
4594   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4595 
4596   Not Collective
4597 
4598   Input Parameter:
4599 . A - a `MATSEQAIJCUSPARSE` matrix
4600 
4601   Output Parameter:
4602 . a - pointer to the device data
4603 
4604   Level: developer
4605 
4606   Note:
4607   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4608 
4609 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4610 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: bring the device copy up to date first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(mat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = mat->values->data().get();
  /* the caller may write through *a: mark the GPU copy authoritative and drop any cached transpose values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4630 /*@C
4631   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4632 
4633   Not Collective
4634 
4635   Input Parameters:
4636 + A - a `MATSEQAIJCUSPARSE` matrix
4637 - a - pointer to the device data
4638 
4639   Level: developer
4640 
4641 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4642 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed through *a: bump the object state so dependents notice */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4653 
4654 /*@C
4655   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4656 
4657   Not Collective
4658 
4659   Input Parameter:
4660 . A - a `MATSEQAIJCUSPARSE` matrix
4661 
4662   Output Parameter:
4663 . a - pointer to the device data
4664 
4665   Level: developer
4666 
4667   Note:
4668   Does not trigger any host to device copies.
4669 
4670   It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current
4671 
4672 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4673 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no MatSeqAIJCUSPARSECopyToGPU() here — existing values will be overwritten */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(mat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = mat->values->data().get();
  /* GPU copy becomes authoritative; cached transpose values are stale now */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4692 
4693 /*@C
4694   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4695 
4696   Not Collective
4697 
4698   Input Parameters:
4699 + A - a `MATSEQAIJCUSPARSE` matrix
4700 - a - pointer to the device data
4701 
4702   Level: developer
4703 
4704 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4705 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the caller wrote new values: advance the object state before releasing the pointer */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4716 
/* Strict-weak ordering on (row, col, value, flag) tuples: lexicographic by (row, col).
   Used to merge the COO representations of two matrices into (row, col) order. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4725 
/* Unary functor returning its argument plus a fixed offset; used below to shift
   B's column indices (and transposed row offsets) when concatenating with A. */
struct Shift {
  int _off; /* constant offset added to every input */

  Shift(int shift) : _off(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _off + c; }
};
4732 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows, i.e. C = [A B]
   ([A';B']' operation in MATLAB notation). A and B must have the same number of rows;
   B's column indices are shifted by A->cmap->n. With MAT_REUSE_MATRIX only the values of
   C are refreshed (via the permutation saved in Ccusp->coords at creation time). */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and hand-build its CUSPARSE data structures (no host assembly) */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* coords will record where each entry of A (first Annz slots) and B (rest) lands in C,
       so the MAT_REUSE_MATRIX path can scatter new values without re-merging */
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets to per-entry COO row indices so the two nonzero sets can be
         merged by (row, col) */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag A entries with 1 and B entries with 0 so they can be told apart after the merge */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* view B's column indices shifted right by A's column count (B occupies columns
         [A->cmap->n, n) of C) without modifying B */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      /* merge the two sorted (row, col, value, tag) streams into C's arrays; wPerm ends up
         holding the A/B tag of each merged entry */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place column shift applied to B above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
  #else
      auto pred = cuda::std::identity();
  #endif
      /* split the merged positions by tag: indices of A entries go to coords[0..Annz),
         indices of B entries to coords[Annz..) */
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked vertically, so its CSR arrays are simple concatenations of
           A' and B' with B's transposed row offsets shifted by a->nz */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* back up one so B's first offset overwrites A's trailing nnz entry */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's CSR structure on the host (i/j only; the values stay on the device) */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* fill in the per-row bookkeeping (row lengths, nonzero-row count, longest row) */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: C's structure already exists — only scatter the current values of
       A and B into C using the permutation recorded in Ccusp->coords */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through coords[0..Annz) ... */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      /* ... and B's values through coords[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C' values too: its value array is A' values followed by B' values */
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
5016 
/* Copy the subset of A's device CSR values selected by idx[] (or the leading n values when
   idx is NULL) into v[], which may live in host or device memory. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* is the destination device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); /* idx[] moved host -> device */

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device scratch buffer, then copy it back */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[idx[k]] -> dv[k] on the device via a permutation iterator */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* values moved device -> host when the destination is host memory (was incorrectly logged
     as a CPU-to-GPU transfer) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5052