xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision fd5a855563b981e3d8ad570cd3f080c8adb7dcc2)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17 // thrust::for_each(thrust::cuda::par.on()) requires C++14
18 #endif
19 #include <thrust/iterator/constant_iterator.h>
20 #include <thrust/remove.h>
21 #include <thrust/sort.h>
22 #include <thrust/unique.h>
23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24   #include <cuda/std/functional>
25 #endif
26 
27 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29 /*
30   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32 */
33 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
36 #endif
37 
38 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
41 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
43 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
44 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
45 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
46 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
47 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
48 #endif
49 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
50 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
51 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
52 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
53 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
54 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
55 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
56 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
57 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
58 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
59 
60 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
61 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
62 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
63 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
64 
65 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
66 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
67 
68 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
69 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
70 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71 
72 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
73 {
74   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
75 
76   PetscFunctionBegin;
77   switch (op) {
78   case MAT_CUSPARSE_MULT:
79     cusparsestruct->format = format;
80     break;
81   case MAT_CUSPARSE_ALL:
82     cusparsestruct->format = format;
83     break;
84   default:
85     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
86   }
87   PetscFunctionReturn(PETSC_SUCCESS);
88 }
89 
90 /*@
91   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
92   operation. Only the `MatMult()` operation can use different GPU storage formats
93 
94   Not Collective
95 
96   Input Parameters:
97 + A      - Matrix of type `MATSEQAIJCUSPARSE`
98 . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
99            `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
100 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
101 
102   Level: intermediate
103 
104 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
105 @*/
106 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
107 {
108   PetscFunctionBegin;
109   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
110   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
111   PetscFunctionReturn(PETSC_SUCCESS);
112 }
113 
114 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
115 {
116   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
117 
118   PetscFunctionBegin;
119   cusparsestruct->use_cpu_solve = use_cpu;
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /*@
124   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
125 
126   Input Parameters:
127 + A       - Matrix of type `MATSEQAIJCUSPARSE`
128 - use_cpu - set flag for using the built-in CPU `MatSolve()`
129 
130   Level: intermediate
131 
132   Note:
133   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
134   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
135   This method to specify if the solve is done on the CPU or GPU (GPU is the default).
136 
137 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138 @*/
139 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
140 {
141   PetscFunctionBegin;
142   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
143   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
144   PetscFunctionReturn(PETSC_SUCCESS);
145 }
146 
147 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
148 {
149   PetscFunctionBegin;
150   switch (op) {
151   case MAT_FORM_EXPLICIT_TRANSPOSE:
152     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
153     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
154     A->form_explicit_transpose = flg;
155     break;
156   default:
157     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
158     break;
159   }
160   PetscFunctionReturn(PETSC_SUCCESS);
161 }
162 
163 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
164 {
165   MatCUSPARSEStorageFormat format;
166   PetscBool                flg;
167   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
168 
169   PetscFunctionBegin;
170   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
171   if (A->factortype == MAT_FACTOR_NONE) {
172     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
173     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
174 
175     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
176     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
177     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
178     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
179 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
180     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
181     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
182   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
183     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
184   #else
185     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186   #endif
187     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
188     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
189 
190     PetscCall(
191       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
192     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
193 #endif
194   }
195   PetscOptionsHeadEnd();
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
200 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
201 {
202   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
203   PetscInt                      m  = A->rmap->n;
204   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
205   const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
206   const MatScalar              *Aa = a->a;
207   PetscInt                     *Mi, *Mj, Mnz;
208   PetscScalar                  *Ma;
209 
210   PetscFunctionBegin;
211   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
212   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
213     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
214       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
215       Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
216       PetscCall(PetscMalloc1(m + 1, &Mi));
217       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
218       PetscCall(PetscMalloc1(Mnz, &Ma));
219       Mi[0] = 0;
220       for (PetscInt i = 0; i < m; i++) {
221         PetscInt llen = Ai[i + 1] - Ai[i];
222         PetscInt ulen = adiag[i] - adiag[i + 1];
223         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
224         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
225         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
226         Mi[i + 1] = Mi[i] + llen + ulen;
227       }
228       // Copy M (L,U) from host to device
229       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
230       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
231       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
232       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
233       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
234 
235       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
236       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
237       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
238       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
239       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
240       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
241       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
242       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
243 
244       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
245       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
246       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
247 
248       fillMode = CUSPARSE_FILL_MODE_UPPER;
249       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
250       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
251       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
252       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
253 
254       // Allocate work vectors in SpSv
255       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
256       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
257 
258       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
259       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
260 
261       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
262       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
263       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
264       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
265       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
266       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
267       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
268 
269       // Record for reuse
270       fs->csrRowPtr_h = Mi;
271       fs->csrVal_h    = Ma;
272       PetscCall(PetscFree(Mj));
273     }
274     // Copy the value
275     Mi  = fs->csrRowPtr_h;
276     Ma  = fs->csrVal_h;
277     Mnz = Mi[m];
278     for (PetscInt i = 0; i < m; i++) {
279       PetscInt llen = Ai[i + 1] - Ai[i];
280       PetscInt ulen = adiag[i] - adiag[i + 1];
281       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
282       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry
283       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
284     }
285     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
286 
287   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
288     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
289       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
290       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
291       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
292     } else
293   #endif
294     {
295       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
296       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
297 
298       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
299       fs->updatedSpSVAnalysis          = PETSC_TRUE;
300       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
301     }
302   }
303   PetscFunctionReturn(PETSC_SUCCESS);
304 }
305 #else
306 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
307 {
308   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
309   PetscInt                           n                  = A->rmap->n;
310   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
311   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
312   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
313   const MatScalar                   *aa = a->a, *v;
314   PetscInt                          *AiLo, *AjLo;
315   PetscInt                           i, nz, nzLower, offset, rowOffset;
316 
317   PetscFunctionBegin;
318   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
319   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
320     try {
321       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
322       nzLower = n + ai[n] - ai[1];
323       if (!loTriFactor) {
324         PetscScalar *AALo;
325 
326         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
327 
328         /* Allocate Space for the lower triangular matrix */
329         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
330         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
331 
332         /* Fill the lower triangular matrix */
333         AiLo[0]   = (PetscInt)0;
334         AiLo[n]   = nzLower;
335         AjLo[0]   = (PetscInt)0;
336         AALo[0]   = (MatScalar)1.0;
337         v         = aa;
338         vi        = aj;
339         offset    = 1;
340         rowOffset = 1;
341         for (i = 1; i < n; i++) {
342           nz = ai[i + 1] - ai[i];
343           /* additional 1 for the term on the diagonal */
344           AiLo[i] = rowOffset;
345           rowOffset += nz + 1;
346 
347           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
348           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
349 
350           offset += nz;
351           AjLo[offset] = (PetscInt)i;
352           AALo[offset] = (MatScalar)1.0;
353           offset += 1;
354 
355           v += nz;
356           vi += nz;
357         }
358 
359         /* allocate space for the triangular factor information */
360         PetscCall(PetscNew(&loTriFactor));
361         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
362         /* Create the matrix description */
363         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
364         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
365   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
366         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
367   #else
368         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
369   #endif
370         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
371         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
372 
373         /* set the operation */
374         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
375 
376         /* set the matrix */
377         loTriFactor->csrMat              = new CsrMatrix;
378         loTriFactor->csrMat->num_rows    = n;
379         loTriFactor->csrMat->num_cols    = n;
380         loTriFactor->csrMat->num_entries = nzLower;
381 
382         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
383         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
384 
385         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
386         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
387 
388         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
389         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
390 
391         /* Create the solve analysis information */
392         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
393         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
394   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
395         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
396                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
397         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
398   #endif
399 
400         /* perform the solve analysis */
401         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
402                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
403         PetscCallCUDA(WaitForCUDA());
404         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
405 
406         /* assign the pointer */
407         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
408         loTriFactor->AA_h                                          = AALo;
409         PetscCallCUDA(cudaFreeHost(AiLo));
410         PetscCallCUDA(cudaFreeHost(AjLo));
411         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
412       } else { /* update values only */
413         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
414         /* Fill the lower triangular matrix */
415         loTriFactor->AA_h[0] = 1.0;
416         v                    = aa;
417         vi                   = aj;
418         offset               = 1;
419         for (i = 1; i < n; i++) {
420           nz = ai[i + 1] - ai[i];
421           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
422           offset += nz;
423           loTriFactor->AA_h[offset] = 1.0;
424           offset += 1;
425           v += nz;
426         }
427         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
428         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
429       }
430     } catch (char *ex) {
431       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
432     }
433   }
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
437 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
438 {
439   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
440   PetscInt                           n                  = A->rmap->n;
441   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
442   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
443   const PetscInt                    *aj                 = a->j, *adiag, *vi;
444   const MatScalar                   *aa                 = a->a, *v;
445   PetscInt                          *AiUp, *AjUp;
446   PetscInt                           i, nz, nzUpper, offset;
447 
448   PetscFunctionBegin;
449   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
450   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
451   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
452     try {
453       /* next, figure out the number of nonzeros in the upper triangular matrix. */
454       nzUpper = adiag[0] - adiag[n];
455       if (!upTriFactor) {
456         PetscScalar *AAUp;
457 
458         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
459 
460         /* Allocate Space for the upper triangular matrix */
461         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
462         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
463 
464         /* Fill the upper triangular matrix */
465         AiUp[0] = (PetscInt)0;
466         AiUp[n] = nzUpper;
467         offset  = nzUpper;
468         for (i = n - 1; i >= 0; i--) {
469           v  = aa + adiag[i + 1] + 1;
470           vi = aj + adiag[i + 1] + 1;
471 
472           /* number of elements NOT on the diagonal */
473           nz = adiag[i] - adiag[i + 1] - 1;
474 
475           /* decrement the offset */
476           offset -= (nz + 1);
477 
478           /* first, set the diagonal elements */
479           AjUp[offset] = (PetscInt)i;
480           AAUp[offset] = (MatScalar)1. / v[nz];
481           AiUp[i]      = AiUp[i + 1] - (nz + 1);
482 
483           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
484           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
485         }
486 
487         /* allocate space for the triangular factor information */
488         PetscCall(PetscNew(&upTriFactor));
489         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
490 
491         /* Create the matrix description */
492         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
493         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
494   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
495         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
496   #else
497         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
498   #endif
499         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
500         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
501 
502         /* set the operation */
503         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
504 
505         /* set the matrix */
506         upTriFactor->csrMat              = new CsrMatrix;
507         upTriFactor->csrMat->num_rows    = n;
508         upTriFactor->csrMat->num_cols    = n;
509         upTriFactor->csrMat->num_entries = nzUpper;
510 
511         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
512         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
513 
514         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
515         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
516 
517         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
518         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
519 
520         /* Create the solve analysis information */
521         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
522         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
523   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
524         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
525                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
526         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
527   #endif
528 
529         /* perform the solve analysis */
530         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
531                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
532 
533         PetscCallCUDA(WaitForCUDA());
534         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
535 
536         /* assign the pointer */
537         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
538         upTriFactor->AA_h                                          = AAUp;
539         PetscCallCUDA(cudaFreeHost(AiUp));
540         PetscCallCUDA(cudaFreeHost(AjUp));
541         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
542       } else {
543         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
544         /* Fill the upper triangular matrix */
545         offset = nzUpper;
546         for (i = n - 1; i >= 0; i--) {
547           v = aa + adiag[i + 1] + 1;
548 
549           /* number of elements NOT on the diagonal */
550           nz = adiag[i] - adiag[i + 1] - 1;
551 
552           /* decrement the offset */
553           offset -= (nz + 1);
554 
555           /* first, set the diagonal elements */
556           upTriFactor->AA_h[offset] = 1. / v[nz];
557           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
558         }
559         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
560         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
561       }
562     } catch (char *ex) {
563       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
564     }
565   }
566   PetscFunctionReturn(PETSC_SUCCESS);
567 }
568 #endif
569 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Put the ILU/LU factors of A on the GPU and cache the orderings there

  The factor data is produced by the CUDA-version specific builder(s) called below; afterwards the matrix is
  flagged PETSC_OFFLOAD_BOTH. Each ordering (a->row, a->icol) is uploaded only when it is not the identity and
  no device copy of it exists yet.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  IS                            rowperm = aij->row;
  IS                            colperm = aij->icol;
  const PetscInt                nrows   = A->rmap->n;
  PetscBool                     isIdentity;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(nrows);
#endif

  fs->nnz        = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH; // the factored matrix now lives on both host and device

  // Row ordering (used with the lower triangular solve)
  PetscCall(ISIdentity(rowperm, &isIdentity));
  if (!isIdentity) {
    if (!fs->rpermIndices) {
      const PetscInt *idx;

      PetscCall(ISGetIndices(rowperm, &idx));
      fs->rpermIndices = new THRUSTINTARRAY(nrows);
      fs->rpermIndices->assign(idx, idx + nrows);
      PetscCall(ISRestoreIndices(rowperm, &idx));
      PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
    }
  }

  // Column ordering (used with the upper triangular solve)
  PetscCall(ISIdentity(colperm, &isIdentity));
  if (!isIdentity) {
    if (!fs->cpermIndices) {
      const PetscInt *idx;

      PetscCall(ISGetIndices(colperm, &idx));
      fs->cpermIndices = new THRUSTINTARRAY(nrows);
      fs->cpermIndices->assign(idx, idx + nrows);
      PetscCall(ISRestoreIndices(colperm, &idx));
      PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
616 
617 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky - Copy the host Cholesky/ICC factor of A to the device for cusparseSpSV

  Does nothing unless A->offloadmask == PETSC_OFFLOAD_CPU. On the first call (fs->csrRowPtr is NULL) it builds the
  column indices of M (U with an explicit unit diagonal), allocates the device arrays, the work vectors X/Y, the
  cuSPARSE descriptors and the SpSV buffers for the U and Ut solves. On every call it rebuilds the values of M and
  the diagonal D on the host, uploads them, and then either runs cusparseSpSV_analysis() or, when an analysis was
  already done and CUDA >= 12.1.1, cusparseSpSV_updateMatrix().
*/
618 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
619 {
620   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
621   PetscInt                      m  = A->rmap->n;
622   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
623   const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
624   const MatScalar              *Aa = a->a;
625   PetscInt                     *Mj, Mnz;
626   PetscScalar                  *Ma, *D;
627
628   PetscFunctionBegin;
629   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
630   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
631     if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is non-null even when m=0
632       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
633       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
634       Mnz = Ai[m]; // Unz (with the unit diagonal)
635       PetscCall(PetscMalloc1(Mnz, &Ma));
636       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
637       PetscCall(PetscMalloc1(m, &D));    // the diagonal
638       for (PetscInt i = 0; i < m; i++) {
639         PetscInt ulen = Ai[i + 1] - Ai[i];
640         Mj[Ai[i]]     = i;                                              // diagonal entry
641         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
642       }
643       // Copy M (U) from host to device
644       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
645       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
646       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
647       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
648       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
649       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice)); // only the pattern here; csrVal and diag are uploaded further down, on every call
650
651       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
652       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
653       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
654       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
655       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
656       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
657       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
658       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
659
660       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
661       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
662       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
663
664       // Allocate work vectors in SpSv
665       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
666       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
667
668       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
669       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
670
671       // Query buffer sizes for SpSV and then allocate buffers
672       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
673       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
674       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
675
676       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
677       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
678       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
679
680       // Record the host value/diagonal staging arrays for reuse by later calls; the temporary Mj[] is released
681       fs->csrVal_h = Ma;
682       fs->diag_h   = D;
683       PetscCall(PetscFree(Mj));
684     }
685     // Refresh the values: rebuild Ma and D on host from the current factor, then upload them
686     Ma  = fs->csrVal_h;
687     D   = fs->diag_h;
688     Mnz = Ai[m];
689     for (PetscInt i = 0; i < m; i++) {
690       D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
691       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
692       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // off-diagonal entries of row i are stored negated in M
693     }
694     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
695     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
696
697   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
      // An analysis already exists: only tell the two SpSV descriptors about the new values (skipped when csrVal is NULL)
698     if (fs->updatedSpSVAnalysis) {
699       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
700       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
701     } else
702   #endif
703     {
704       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
705       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
706       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
707       fs->updatedSpSVAnalysis = PETSC_TRUE;
708     }
709   }
710   PetscFunctionReturn(PETSC_SUCCESS);
711 }
712 
// Solve Ut D U x = b
//
// Uses the SpSV descriptors, the device diagonal fs->diag and the work vectors fs->X / fs->Y stored in A->spptr.
// Steps: (1) gather b through fs->rpermIndices into fs->X when a row permutation exists, otherwise read b in place;
// (2) solve Ut Y = X; (3) scale Y entrywise by fs->diag; (4) solve U X = Y, writing straight into x when there is
// no column permutation; (5) otherwise gather fs->X through fs->cpermIndices into x.
// The same routine is also installed as the transpose solve by MatCholeskyFactorNumeric_SeqAIJCUSPARSE().
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;
  // Entrywise product functor; newer CCCL releases take it from cuda::std instead of thrust
  #if CCCL_VERSION >= 3001000
  using Multiplies = cuda::std::multiplies<PetscScalar>;
  #else
  using Multiplies = thrust::multiplies<PetscScalar>;
  #endif

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // cusparseDnVecSetValues() takes a non-const pointer; b is only read by the solve below
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), Multiplies()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x.
  // Both ends of the range are built on the same element base (fs->X); only the index iterators differ.
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772 #else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - Build or refresh the two device triangular factors used by the csrsv solves
  (this variant is compiled only when CUDA < 11.4)

  Returns immediately when A has no rows, and does nothing unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or
  PETSC_OFFLOAD_CPU. Both factors share one CSR pattern (AiUp/AjUp): upTriFactor uses a unit-diagonal descriptor and
  the NON_TRANSPOSE operation, loTriFactor uses a non-unit-diagonal descriptor and the TRANSPOSE operation.
  When neither factor exists yet, the pattern, descriptors and csrsv analyses are created; otherwise only the values
  are recomputed on the host and re-assigned to the existing device arrays.
*/
773 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
774 {
775   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
776   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
777   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
778   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
779   PetscInt                          *AiUp, *AjUp;
780   PetscScalar                       *AAUp;
781   PetscScalar                       *AALo;
782   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
783   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* NOTE(review): the same A->data is viewed both as Mat_SeqAIJ (a) and Mat_SeqSBAIJ (b) - confirm the i/j/a members line up */
784   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
785   const MatScalar                   *aa = b->a, *v;
786
787   PetscFunctionBegin;
788   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
789   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
790     try {
          /* host staging buffers for the values of the two factors; released with cudaFreeHost() at the end of the try block */
791       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
792       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
793       if (!upTriFactor && !loTriFactor) {
794         /* Allocate Space for the upper triangular matrix */
795         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
796         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
797
798         /* Fill the upper triangular matrix */
799         AiUp[0] = (PetscInt)0;
800         AiUp[n] = nzUpper;
801         offset  = 0;
802         for (i = 0; i < n; i++) {
803           /* set the pointers */
804           v  = aa + ai[i];
805           vj = aj + ai[i];
806           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
807
808           /* first, set the diagonal elements */
809           AjUp[offset] = (PetscInt)i;
810           AAUp[offset] = (MatScalar)1.0 / v[nz];
811           AiUp[i]      = offset;
812           AALo[offset] = (MatScalar)1.0 / v[nz];
813
814           offset += 1;
815           if (nz > 0) {
816             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
817             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
                /* upper values are the negated host entries; lower values are those divided by v[nz] */
818             for (j = offset; j < offset + nz; j++) {
819               AAUp[j] = -AAUp[j];
820               AALo[j] = AAUp[j] / v[nz];
821             }
822             offset += nz;
823           }
824         }
825
826         /* allocate space for the triangular factor information */
827         PetscCall(PetscNew(&upTriFactor));
828         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
829
830         /* Create the matrix description */
831         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
832         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
833   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
834         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
835   #else
836         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
837   #endif
838         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
839         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
840
841         /* set the matrix */
842         upTriFactor->csrMat              = new CsrMatrix;
843         upTriFactor->csrMat->num_rows    = A->rmap->n;
844         upTriFactor->csrMat->num_cols    = A->cmap->n;
845         upTriFactor->csrMat->num_entries = a->nz;
846
847         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
848         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
849
850         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
852
853         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
855
856         /* set the operation */
857         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
858
859         /* Create the solve analysis information */
860         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
861         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
862   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
863         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
864                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
865         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
866   #endif
867
868         /* perform the solve analysis */
869         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
870                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
871
872         PetscCallCUDA(WaitForCUDA());
873         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
874
875         /* assign the pointer */
876         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
877
878         /* allocate space for the triangular factor information */
879         PetscCall(PetscNew(&loTriFactor));
880         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
881
882         /* Create the matrix description */
883         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
884         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
885   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
886         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
887   #else
888         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
889   #endif
            /* the "lower" factor reuses the upper pattern (AiUp/AjUp below), hence FILL_MODE_UPPER combined with the TRANSPOSE operation */
890         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
891         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
892
893         /* set the operation */
894         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
895
896         /* set the matrix */
897         loTriFactor->csrMat              = new CsrMatrix;
898         loTriFactor->csrMat->num_rows    = A->rmap->n;
899         loTriFactor->csrMat->num_cols    = A->cmap->n;
900         loTriFactor->csrMat->num_entries = a->nz;
901
902         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
903         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
904
905         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
906         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
907
908         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
909         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
910
911         /* Create the solve analysis information */
912         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
913         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
914   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
915         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
916                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
917         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
918   #endif
919
920         /* perform the solve analysis */
921         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
922                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
923
924         PetscCallCUDA(WaitForCUDA());
925         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
926
927         /* assign the pointer */
928         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
929
930         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
931         PetscCallCUDA(cudaFreeHost(AiUp));
932         PetscCallCUDA(cudaFreeHost(AjUp));
933       } else {
            /* At least one factor already exists: recompute the values only, the pattern is not rebuilt */
934         /* Fill the upper triangular matrix */
935         offset = 0;
936         for (i = 0; i < n; i++) {
937           /* set the pointers */
938           v  = aa + ai[i];
939           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
940
941           /* first, set the diagonal elements */
942           AAUp[offset] = 1.0 / v[nz];
943           AALo[offset] = 1.0 / v[nz];
944
945           offset += 1;
946           if (nz > 0) {
947             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
948             for (j = offset; j < offset + nz; j++) {
949               AAUp[j] = -AAUp[j];
950               AALo[j] = AAUp[j] / v[nz];
951             }
952             offset += nz;
953           }
954         }
            /* this branch is also reached when only one of the two factors exists, so both are checked before use */
955         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
956         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
957         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
958         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
959         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
960       }
961       PetscCallCUDA(cudaFreeHost(AAUp));
962       PetscCallCUDA(cudaFreeHost(AALo));
963     } catch (char *ex) {
964       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
965     }
966   }
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 #endif
970 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Put the ICC/Cholesky factor of A on the GPU and upload its ordering

  The factor data is produced by the CUDA-version specific builder called below; afterwards the matrix is flagged
  PETSC_OFFLOAD_BOTH. When the ordering a->row is not the identity, the ordering and its inverse are copied into
  rpermIndices and cpermIndices. The device arrays are allocated once and refilled on later calls, so repeated
  numeric factorizations do not leak the previous arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* a->nz counts one triangle including the diagonal; count both triangles but the diagonal only once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    /* reuse the device arrays from a previous factorization instead of overwriting (and leaking) the pointers */
    if (!cusparseTriFactors->rpermIndices) cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    if (!cusparseTriFactors->cpermIndices) cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky/ICC factorization of A into B for MATSEQAIJCUSPARSE

  Calls MatSeqAIJCUSPARSECopyFromGPU() on A, runs the host factorization MatCholeskyFactorNumeric_SeqAIJ(), installs
  the GPU solve routines on B (no MatMatSolve variants are provided) and finally hands B to
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU().
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; // the fresh factor exists on the host only at this point

  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  // The same routine is installed for both the solve and the transpose solve
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* choose the MatSolve variant depending on whether the ordering is the identity */
    PetscBool natural;

    PetscCall(ISIdentity(static_cast<Mat_SeqAIJ *>(B->data)->row, &natural));
    if (!natural) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
  }
#endif

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1045 
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054   cusparseIndexBase_t                indexBase;
1055   cusparseMatrixType_t               matrixType;
1056   cusparseFillMode_t                 fillMode;
1057   cusparseDiagType_t                 diagType;
1058 
1059   PetscFunctionBegin;
1060   /* allocate space for the transpose of the lower triangular factor */
1061   PetscCall(PetscNew(&loTriFactorT));
1062   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063 
1064   /* set the matrix descriptors of the lower triangular factor */
1065   matrixType = cusparseGetMatType(loTriFactor->descr);
1066   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069 
1070   /* Create the matrix description */
1071   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076 
1077   /* set the operation */
1078   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079 
1080   /* allocate GPU space for the CSC of the lower triangular factor*/
1081   loTriFactorT->csrMat                 = new CsrMatrix;
1082   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088 
1089   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095   #endif
1096 
1097   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098   {
1099     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104   #else
1105                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106   #endif
1107     PetscCallCUSPARSE(stat);
1108   }
1109 
1110   PetscCallCUDA(WaitForCUDA());
1111   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112 
1113   /* Create the solve analysis information */
1114   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120   #endif
1121 
1122   /* perform the solve analysis */
1123   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125 
1126   PetscCallCUDA(WaitForCUDA());
1127   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128 
1129   /* assign the pointer */
1130   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131 
1132   /*********************************************/
1133   /* Now the Transpose of the Upper Tri Factor */
1134   /*********************************************/
1135 
1136   /* allocate space for the transpose of the upper triangular factor */
1137   PetscCall(PetscNew(&upTriFactorT));
1138   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139 
1140   /* set the matrix descriptors of the upper triangular factor */
1141   matrixType = cusparseGetMatType(upTriFactor->descr);
1142   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145 
1146   /* Create the matrix description */
1147   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152 
1153   /* set the operation */
1154   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155 
1156   /* allocate GPU space for the CSC of the upper triangular factor*/
1157   upTriFactorT->csrMat                 = new CsrMatrix;
1158   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164 
1165   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171   #endif
1172 
1173   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   {
1175     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180   #else
1181                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182   #endif
1183     PetscCallCUSPARSE(stat);
1184   }
1185 
1186   PetscCallCUDA(WaitForCUDA());
1187   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188 
1189   /* Create the solve analysis information */
1190   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196   #endif
1197 
1198   /* perform the solve analysis */
1199   /* christ, would it have killed you to put this stuff in a function????????? */
1200   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202 
1203   PetscCallCUDA(WaitForCUDA());
1204   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205 
1206   /* assign the pointer */
1207   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211 
/*
  PetscScalarToPetscInt - Unary functor, callable from host and device code, that converts a PetscScalar
  to a PetscInt by taking its real part and casting it (fractional part is discarded by the cast).

  The call operator is const-qualified so the (stateless) functor can also be invoked through a const
  object or const reference.
*/
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) const { return static_cast<PetscInt>(PetscRealPart(s)); }
};
1215 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates, or refreshes the values of, the explicit transpose of A kept in
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose

  Input Parameter:
. A - the matrix; A->spptr is used as a Mat_SeqAIJCUSPARSE and A->data as a Mat_SeqAIJ

  Notes:
  MatSeqAIJCUSPARSECopyToGPU() is called first. If A->transupdated is already set the routine returns without
  doing any further work.

  MAT_CUSPARSE_CSR: the transposed structure is allocated once. The csr2csc permutation (cusparsestruct->csr2csc_i)
  is also computed once, by running csr2csc on the sequence 0,1,2,... in place of the matrix values and converting
  the resulting "values" to integers. Every refresh then gathers the current values of A through that permutation
  into the transposed values array.

  MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB: MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE) is called and the
  transpose is rebuilt through HYB -> CSR -> CSC -> HYB. This path is only compiled for CUDA < 11; with CUDA >= 11
  it raises PETSC_ERR_SUP.

  On success matTranspose is (re)assigned, its cprowIndices is set to NULL and A->transupdated becomes PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* Only the CSR format can update an existing transpose in place; other formats start over */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants 1, 0 and 1 */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* The transpose has as many rows as A has columns, and vice versa */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* Uncompressed row offsets of A on the device, taken from the host array a->i */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The CSC of A is the CSR of A^T, which has cmap->n rows and rmap->n columns; its offsets array
         (the CSC column pointer written by csr2csc) therefore needs cmap->n + 1 entries. */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the matrix being converted is A^T, so pass the transposed dimensions */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Transpose the sequence 0,1,2,... instead of the real values: after csr2csc, entry k of the
         transposed "values" holds the position in A's value array that maps to position k of A^T */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      /* When there are no nonzeros, csr2csc mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
         mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
         I checked every parameters and they were just fine. I have no clue why cusparse complains.

         Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
         should be filled with indexBase. So the else-branch below just takes that shortcut.
      */
      if (matrix->num_entries) {
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
        PetscCallCUSPARSE(stat);
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* Turn the transposed sequence (stored as scalars) into the integer permutation csr2csc_i */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* Gather the current values of A through the permutation into the transposed values array */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408 
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - Performs the two triangular solves L Y = X and U X = Y with cusparseSpSV_solve(),
  using the descriptors stored in the Mat_SeqAIJCUSPARSETriFactors attached to A

  Input Parameters:
+ A - the factored matrix (A->spptr holds the Mat_SeqAIJCUSPARSETriFactors)
- b - the right-hand side vector

  Output Parameter:
. x - the solution vector

  Notes:
  If fs->rpermIndices is set, b is first gathered through it into the work array fs->X; otherwise the device
  array of b is used directly as the right-hand side. If fs->cpermIndices is set, the upper solve writes into
  fs->X and the result is gathered through cpermIndices into x; otherwise the upper solve writes into x directly.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                m       = A->rmap->n;
  const cusparseOperation_t     op      = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       alg     = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *b_dev;
  PetscScalar                  *x_dev;
  void                         *lowerRhs, *upperSol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &x_dev));
  PetscCall(VecCUDAGetArrayRead(b, &b_dev));

  /* Right-hand side of the lower solve: the row-permuted copy of b in fs->X, or b itself */
  if (fs->rpermIndices) {
    thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(b_dev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                 thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()),
                                 thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()),
                                 thrust::device_pointer_cast(fs->X)));
    lowerRhs = fs->X;
  } else {
    lowerRhs = (void *)b_dev;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, lowerRhs));

  /* Lower solve, L Y = X, with fs->Y as the intermediate vector.
     Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L,
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  /* Upper solve, U X = Y; the result lands in fs->X when a column permutation still has to be applied */
  upperSol = fs->cpermIndices ? (void *)fs->X : (void *)x_dev;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, upperSol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U,
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  /* Gather fs->X through the column permutation into x */
  if (fs->cpermIndices) {
    thrust::device_ptr<PetscScalar> xGPU = thrust::device_pointer_cast(x_dev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()),
                                 xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &b_dev));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_dev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Performs the transposed triangular solves Ut Y = X and Lt X = Y with
  cusparseSpSV_solve(), reusing the L and U matrix descriptors with CUSPARSE_OPERATION_TRANSPOSE

  Input Parameters:
+ A - the factored matrix (A->spptr holds the Mat_SeqAIJCUSPARSETriFactors)
- b - the right-hand side vector

  Output Parameter:
. x - the solution vector

  Notes:
  The transpose SpSV descriptors and their buffers are created when fs->createdTransposeSpSVDescr is not yet set,
  and the transpose analysis is (re)run whenever fs->updatedTransposeSpSVAnalysis is not set.

  The permutation handling mirrors the non-transposed solve: b is gathered through fs->rpermIndices into fs->X
  when present, and the final result is gathered through fs->cpermIndices into x when present.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij   = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                m     = A->rmap->n;
  const cusparseOperation_t     opA   = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       alg   = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *b_dev;
  PetscScalar                  *x_dev;
  void                         *firstRhs, *secondSol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  /* One-time setup on the first MatSolveTranspose(): descriptors, buffer sizes, then the buffers themselves */
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    /* The matrix is still L. We only do transpose solve with it */
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L,
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U,
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Transpose analysis for both factors, skipped while the stored analysis is flagged as up to date */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L,
                                            fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U,
                                            fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &x_dev));
  PetscCall(VecCUDAGetArrayRead(b, &b_dev));

  /* Right-hand side of the first solve: the row-permuted copy of b in fs->X, or b itself */
  if (fs->rpermIndices) {
    thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(b_dev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                 thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()),
                                 thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()),
                                 thrust::device_pointer_cast(fs->X)));
    firstRhs = fs->X;
  } else {
    firstRhs = (void *)b_dev;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, firstRhs));

  /* First solve, Ut Y = X, with fs->Y as the intermediate vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U,
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  /* Second solve, Lt X = Y; if need to permute, the intermediate buffer fs->X receives the result */
  secondSol = fs->cpermIndices ? (void *)fs->X : (void *)x_dev;
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, secondSol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L,
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  /* Gather fs->X through the column permutation into x */
  if (fs->cpermIndices) {
    thrust::device_ptr<PetscScalar> xGPU = thrust::device_pointer_cast(x_dev);

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()),
                                 xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &b_dev));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_dev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1532 #else
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transposed solve using the explicitly transposed triangular factors
  (loTriFactorPtrTranspose / upTriFactorPtrTranspose) and the legacy csrsv interface

  Input Parameters:
+ A  - the factored matrix (A->spptr holds the Mat_SeqAIJCUSPARSETriFactors)
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called to build them.

  Both rpermIndices and cpermIndices are dereferenced unconditionally, so this routine requires both
  permutations to be present.

  All thrust calls are wrapped in PetscCallThrust(), like the other thrust calls in this file, so that a thrust
  exception is turned into a PETSc error instead of propagating out of the routine.

  Open question kept from the original author: why do we need to analyze the transposed matrix again? Can't we
  just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: xx receives bb gathered through rpermIndices */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U: reads xarray, writes the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: reads the work vector, writes xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
1586 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587 {
1588   const PetscScalar                 *barray;
1589   PetscScalar                       *xarray;
1590   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1594 
1595   PetscFunctionBegin;
1596   /* Analyze the matrix and create the transpose ... on the fly */
1597   if (!loTriFactorT && !upTriFactorT) {
1598     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601   }
1602 
1603   /* Get the GPU pointers */
1604   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1605   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1606 
1607   PetscCall(PetscLogGpuTimeBegin());
1608   /* First, solve U */
1609   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1610                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1611 
1612   /* Then, solve L */
1613   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1614                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1615 
1616   /* restore */
1617   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1618   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1619   PetscCall(PetscLogGpuTimeEnd());
1620   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625 {
1626   const PetscScalar                    *barray;
1627   PetscScalar                          *xarray;
1628   thrust::device_ptr<const PetscScalar> bGPU;
1629   thrust::device_ptr<PetscScalar>       xGPU;
1630   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1634 
1635   PetscFunctionBegin;
1636   /* Get the GPU pointers */
1637   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1638   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639   xGPU = thrust::device_pointer_cast(xarray);
1640   bGPU = thrust::device_pointer_cast(barray);
1641 
1642   PetscCall(PetscLogGpuTimeBegin());
1643   /* First, reorder with the row permutation */
1644   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1645 
1646   /* Next, solve L */
1647   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1648                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1649 
1650   /* Then, solve U */
1651   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1652                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1653 
1654   /* Last, reorder with the column permutation */
1655   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1656 
1657   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1658   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1659   PetscCall(PetscLogGpuTimeEnd());
1660   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1661   PetscFunctionReturn(PETSC_SUCCESS);
1662 }
1663 
1664 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665 {
1666   const PetscScalar                 *barray;
1667   PetscScalar                       *xarray;
1668   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1672 
1673   PetscFunctionBegin;
1674   /* Get the GPU pointers */
1675   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1676   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1677 
1678   PetscCall(PetscLogGpuTimeBegin());
1679   /* First, solve L */
1680   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1681                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1682 
1683   /* Next, solve U */
1684   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1685                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1686 
1687   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1688   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1689   PetscCall(PetscLogGpuTimeEnd());
1690   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1691   PetscFunctionReturn(PETSC_SUCCESS);
1692 }
1693 #endif
1694 
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1696 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697 {
1698   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1700   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701   CsrMatrix                    *Acsr;
1702   PetscInt                      m, nz;
1703   PetscBool                     flg;
1704 
1705   PetscFunctionBegin;
1706   if (PetscDefined(USE_DEBUG)) {
1707     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709   }
1710 
1711   /* Copy A's value to fact */
1712   m  = fact->rmap->n;
1713   nz = aij->nz;
1714   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715   Acsr = (CsrMatrix *)Acusp->mat->mat;
1716   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1717 
1718   PetscCall(PetscLogGpuTimeBegin());
1719   /* Factorize fact inplace */
1720   if (m)
1721     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1722                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1723   if (PetscDefined(USE_DEBUG)) {
1724     int              numerical_zero;
1725     cusparseStatus_t status;
1726     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1727     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1728   }
1729 
1730   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1731   if (fs->updatedSpSVAnalysis) {
1732     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1733     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1734   } else
1735   #endif
1736   {
1737     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1738      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1739     */
1740     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1741 
1742     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1743 
1744     fs->updatedSpSVAnalysis = PETSC_TRUE;
1745     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1746     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1747   }
1748 
1749   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1750   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1751   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1752   fact->ops->matsolve          = NULL;
1753   fact->ops->matsolvetranspose = NULL;
1754   PetscCall(PetscLogGpuTimeEnd());
1755   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1756   PetscFunctionReturn(PETSC_SUCCESS);
1757 }
1758 
1759 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1760 {
1761   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1762   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1763   PetscInt                      m, nz;
1764 
1765   PetscFunctionBegin;
1766   if (PetscDefined(USE_DEBUG)) {
1767     PetscBool flg, diagDense;
1768 
1769     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1770     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1771     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1772     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1773     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
1774   }
1775 
1776   /* Free the old stale stuff */
1777   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1778 
1779   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1780      but they will not be used. Allocate them just for easy debugging.
1781    */
1782   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1783 
1784   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1785   fact->factortype             = MAT_FACTOR_ILU;
1786   fact->info.factor_mallocs    = 0;
1787   fact->info.fill_ratio_given  = info->fill;
1788   fact->info.fill_ratio_needed = 1.0;
1789 
1790   aij->row = NULL;
1791   aij->col = NULL;
1792 
1793   /* ====================================================================== */
1794   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1795   /* We'll do in-place factorization on fact                                */
1796   /* ====================================================================== */
1797   const int *Ai, *Aj;
1798 
1799   m  = fact->rmap->n;
1800   nz = aij->nz;
1801 
1802   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1803   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1804   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1805   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1806   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1807   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1808 
1809   /* ====================================================================== */
1810   /* Create descriptors for M, L, U                                         */
1811   /* ====================================================================== */
1812   cusparseFillMode_t fillMode;
1813   cusparseDiagType_t diagType;
1814 
1815   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1816   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1817   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1818 
1819   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1820     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1821     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1822     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1823     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1824   */
1825   fillMode = CUSPARSE_FILL_MODE_LOWER;
1826   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1827   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1828   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1829   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1830 
1831   fillMode = CUSPARSE_FILL_MODE_UPPER;
1832   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1833   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1834   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1835   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1836 
1837   /* ========================================================================= */
1838   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1839   /* ========================================================================= */
1840   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1841   if (m)
1842     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1843                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1844 
1845   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1846   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1847 
1848   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1849   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1850 
1851   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1852   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1853 
1854   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1855   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1856 
1857   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1858      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1859      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1860      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1861    */
1862   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1863     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1864     fs->spsvBuffer_L = fs->factBuffer_M;
1865     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1866   } else {
1867     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1868     fs->spsvBuffer_U = fs->factBuffer_M;
1869     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1870   }
1871 
1872   /* ========================================================================== */
1873   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1874   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1875   /* ========================================================================== */
1876   int              structural_zero;
1877   cusparseStatus_t status;
1878 
1879   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1880   if (m)
1881     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1882                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1883   if (PetscDefined(USE_DEBUG)) {
1884     /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1885     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1886     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1887   }
1888 
1889   /* Estimate FLOPs of the numeric factorization */
1890   {
1891     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
1892     PetscInt       *Ai, nzRow, nzLeft;
1893     const PetscInt *adiag;
1894     PetscLogDouble  flops = 0.0;
1895 
1896     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
1897     Ai = Aseq->i;
1898     for (PetscInt i = 0; i < m; i++) {
1899       if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1900         nzRow  = Ai[i + 1] - Ai[i];
1901         nzLeft = adiag[i] - Ai[i];
1902         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1903           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1904         */
1905         nzLeft = (nzRow - 1) / 2;
1906         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1907       }
1908     }
1909     fs->numericFactFlops = flops;
1910   }
1911   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1912   PetscFunctionReturn(PETSC_SUCCESS);
1913 }
1914 
1915 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1916 {
1917   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1918   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1919   const PetscScalar            *barray;
1920   PetscScalar                  *xarray;
1921 
1922   PetscFunctionBegin;
1923   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1924   PetscCall(VecCUDAGetArrayRead(b, &barray));
1925   PetscCall(PetscLogGpuTimeBegin());
1926 
1927   /* Solve L*y = b */
1928   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1929   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1930   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1931                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1932 
1933   /* Solve Lt*x = y */
1934   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1935   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1936                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1937 
1938   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1939   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1940 
1941   PetscCall(PetscLogGpuTimeEnd());
1942   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1943   PetscFunctionReturn(PETSC_SUCCESS);
1944 }
1945 
1946 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1947 {
1948   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1949   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1950   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1951   CsrMatrix                    *Acsr;
1952   PetscInt                      m, nz;
1953   PetscBool                     flg;
1954 
1955   PetscFunctionBegin;
1956   if (PetscDefined(USE_DEBUG)) {
1957     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1958     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1959   }
1960 
1961   /* Copy A's value to fact */
1962   m  = fact->rmap->n;
1963   nz = aij->nz;
1964   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1965   Acsr = (CsrMatrix *)Acusp->mat->mat;
1966   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1967 
1968   /* Factorize fact inplace */
1969   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1970      csric02() only takes the lower triangular part of matrix A to perform factorization.
1971      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1972      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1973      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1974    */
1975   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1976   if (PetscDefined(USE_DEBUG)) {
1977     int              numerical_zero;
1978     cusparseStatus_t status;
1979     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1980     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1981   }
1982 
1983   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1984   if (fs->updatedSpSVAnalysis) {
1985     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1986     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1987   } else
1988   #endif
1989   {
1990     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1991 
1992     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1993     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1994   */
1995     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1996     fs->updatedSpSVAnalysis = PETSC_TRUE;
1997   }
1998 
1999   fact->offloadmask            = PETSC_OFFLOAD_GPU;
2000   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
2001   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
2002   fact->ops->matsolve          = NULL;
2003   fact->ops->matsolvetranspose = NULL;
2004   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2005   PetscFunctionReturn(PETSC_SUCCESS);
2006 }
2007 
2008 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2009 {
2010   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2011   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2012   PetscInt                      m, nz;
2013 
2014   PetscFunctionBegin;
2015   if (PetscDefined(USE_DEBUG)) {
2016     PetscBool flg, diagDense;
2017 
2018     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2019     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2020     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2021     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
2022     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
2023   }
2024 
2025   /* Free the old stale stuff */
2026   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2027 
2028   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2029      but they will not be used. Allocate them just for easy debugging.
2030    */
2031   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2032 
2033   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2034   fact->factortype             = MAT_FACTOR_ICC;
2035   fact->info.factor_mallocs    = 0;
2036   fact->info.fill_ratio_given  = info->fill;
2037   fact->info.fill_ratio_needed = 1.0;
2038 
2039   aij->row = NULL;
2040   aij->col = NULL;
2041 
2042   /* ====================================================================== */
2043   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2044   /* We'll do in-place factorization on fact                                */
2045   /* ====================================================================== */
2046   const int *Ai, *Aj;
2047 
2048   m  = fact->rmap->n;
2049   nz = aij->nz;
2050 
2051   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2052   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2053   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2054   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2055   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2056   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2057 
2058   /* ====================================================================== */
2059   /* Create mat descriptors for M, L                                        */
2060   /* ====================================================================== */
2061   cusparseFillMode_t fillMode;
2062   cusparseDiagType_t diagType;
2063 
2064   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2065   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2066   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2067 
2068   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2069     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2070     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2071     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2072     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2073   */
2074   fillMode = CUSPARSE_FILL_MODE_LOWER;
2075   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2076   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2077   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2078   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2079 
2080   /* ========================================================================= */
2081   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2082   /* ========================================================================= */
2083   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2084   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2085 
2086   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2087   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2088 
2089   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2090   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2091 
2092   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2093   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2094 
2095   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2096   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2097 
2098   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2099      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2100    */
2101   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2102     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2103     fs->spsvBuffer_L = fs->factBuffer_M;
2104     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2105   } else {
2106     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2107     fs->spsvBuffer_Lt = fs->factBuffer_M;
2108     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2109   }
2110 
2111   /* ========================================================================== */
2112   /* Perform analysis of ic0 on M                                               */
2113   /* The lower triangular part of M has the same sparsity pattern as L          */
2114   /* ========================================================================== */
2115   int              structural_zero;
2116   cusparseStatus_t status;
2117 
2118   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2119   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2120   if (PetscDefined(USE_DEBUG)) {
2121     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2122     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2123     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2124   }
2125 
2126   /* Estimate FLOPs of the numeric factorization */
2127   {
2128     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2129     PetscInt      *Ai, nzRow, nzLeft;
2130     PetscLogDouble flops = 0.0;
2131 
2132     Ai = Aseq->i;
2133     for (PetscInt i = 0; i < m; i++) {
2134       nzRow = Ai[i + 1] - Ai[i];
2135       if (nzRow > 1) {
2136         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2137           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2138         */
2139         nzLeft = (nzRow - 1) / 2;
2140         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2141       }
2142     }
2143     fs->numericFactFlops = flops;
2144   }
2145   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2146   PetscFunctionReturn(PETSC_SUCCESS);
2147 }
2148 #endif
2149 
/*
  Numeric LU/ILU factorization of a MATSEQAIJCUSPARSE matrix.

  The values of A are first made current on the host, then the factorization itself is carried out
  by the host routine MatLUFactorNumeric_SeqAIJ(). Unless A was configured with use_cpu_solve, the
  solve callbacks of B are redirected to the CUSPARSE triangular solves and
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() is run on B.

  Input Parameters:
+ B    - the factor matrix; as a factored matrix its spptr is a Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix being factored; its spptr is a Mat_SeqAIJCUSPARSE, which is where use_cpu_solve lives
- info - factorization options, forwarded unchanged to MatLUFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSE *acusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  bool                solve_on_gpu;

  PetscFunctionBegin;
  /* The factorization runs on the host, so it needs up-to-date host values of A */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  solve_on_gpu = !acusp->use_cpu_solve;
  if (solve_on_gpu) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Pick the MatSolve variant according to whether both orderings are the identity */
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    IS          rowperm = bseq->row, colperm = bseq->col;
    PetscBool   rows_natural, cols_natural;

    PetscCall(ISIdentity(rowperm, &rows_natural));
    PetscCall(ISIdentity(colperm, &cols_natural));
    if (!(rows_natural && cols_natural)) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* Set up the triangular factors on the device */
  if (solve_on_gpu) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2188 
/*
  Symbolic LU factorization for MATSEQAIJCUSPARSE.

  Resets the triangular-factor data attached to B, lets the host routine MatLUFactorSymbolic_SeqAIJ()
  do the symbolic work, and installs MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric stage.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Drop whatever a previous factorization left behind */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2199 
/*
  Symbolic ILU factorization for MATSEQAIJCUSPARSE.

  With CUDA 11.4 or newer, ILU(0) with identity row and column orderings that is not requested on
  the host (info->factoronhost) is handed to MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). Every other
  case resets the triangular-factor data of B, runs the host MatILUFactorSymbolic_SeqAIJ() and
  installs MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric stage.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    PetscBool rows_natural = PETSC_FALSE, cols_natural = PETSC_FALSE;

    /* The orderings are only inspected when the factorization may run on the device */
    if (!info->factoronhost) {
      PetscCall(ISIdentity(isrow, &rows_natural));
      PetscCall(ISIdentity(iscol, &cols_natural));
    }
    if (!info->levels && rows_natural && cols_natural) {
      PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  /* General path: symbolic factorization on the host */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2222 
/*
  Symbolic ICC factorization for MATSEQAIJCUSPARSE.

  With CUDA 11.4 or newer, ICC(0) with an identity permutation that is not requested on the host
  (info->factoronhost) is handed to MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(). Every other case
  resets the triangular-factor data of B, runs the host MatICCFactorSymbolic_SeqAIJ() and installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric stage.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    PetscBool natural = PETSC_FALSE;

    /* The permutation is only inspected when the factorization may run on the device */
    if (!info->factoronhost) PetscCall(ISIdentity(perm, &natural));
    if (!info->levels && natural) {
      PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  /* General path: symbolic factorization on the host */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2242 
/*
  Symbolic Cholesky factorization for MATSEQAIJCUSPARSE.

  Resets the triangular-factor data attached to B, lets the host routine
  MatCholeskyFactorSymbolic_SeqAIJ() do the symbolic work, and installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric stage.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Drop whatever a previous factorization left behind */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2253 
/*
  Reports MATSOLVERCUSPARSE as the solver type. The matrix argument is not used.

  Output Parameter:
. type - set to MATSOLVERCUSPARSE
*/
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  type[0] = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2260 
2261 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2268 
2269   Level: beginner
2270 
2271 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2272           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2273 M*/
2274 
/*
  Creates the factor matrix used by the "cusparse" solver type for a MATSEQAIJCUSPARSE matrix.

  The new matrix is square with the local row size of A and has type MATSEQAIJCUSPARSE. Its
  factortype is set before MatSetType() so that the type constructor allocates an spptr of type
  Mat_SeqAIJCUSPARSETriFactors. The symbolic factorization callbacks are the CUSPARSE ones unless A
  is bound to the CPU, in which case the plain SeqAIJ ones are installed.

  Input Parameters:
+ A     - the matrix to be factored
- ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

  Output Parameter:
. B - the factor matrix

  Any other factor type raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt nloc = A->rmap->n;
  Mat            fact;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  fact = *B;
  PetscCall(MatSetSizes(fact, nloc, nloc, nloc, nloc));
  fact->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(fact, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(fact, PETSC_TRUE));

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(fact, A, A));
    if (A->boundtocpu) {
      fact->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      fact->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* Nested dissection for the full factorization, natural ordering for the incomplete ones */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&fact->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&fact->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&fact->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      fact->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      fact->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&fact->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&fact->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(fact, MAT_SKIP_ALLOCATION, NULL));
  fact->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)fact, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2315 
/*
  Copies the matrix values from the device into the host array a->a.

  Nothing happens unless the offload mask of A is PETSC_OFFLOAD_GPU. For an unfactored matrix the
  values come from the device CSR storage. For a factored matrix they come from the csrVal array of
  the triangular-factor structure, which is only available with CUDA 11.4 or newer and only when
  csrVal has been allocated; otherwise PETSC_ERR_SUP is raised. On success the offload mask becomes
  PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  if (A->factortype == MAT_FACTOR_NONE) {
    /* A->spptr of an unfactored matrix is a Mat_SeqAIJCUSPARSE */
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
    CsrMatrix          *csr  = (CsrMatrix *)cusp->mat->mat;

    PetscCallCUDA(cudaMemcpy(aij->a, csr->values->data().get(), aij->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* A->spptr of a factored matrix is a Mat_SeqAIJCUSPARSETriFactors */
    Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;

    PetscCheck(fs->csrVal, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    /* We have a factorized matrix on device and are able to copy it to host */
    PetscCallCUDA(cudaMemcpy(aij->a, fs->csrVal, aij->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
#endif
  }
  PetscCall(PetscLogGpuToCpu(aij->nz * sizeof(PetscScalar)));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2345 
/*
  Hands out the host value array of A after refreshing it from the device with
  MatSeqAIJCUSPARSECopyFromGPU().

  Output Parameter:
. array - set to the host value array of the underlying Mat_SeqAIJ
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij      = (Mat_SeqAIJ *)A->data;
  array[0] = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2353 
/*
  Ends access to the host value array: the caller's pointer is cleared and the offload mask is set
  to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2361 
/*
  Hands out the host value array of A as a const pointer after refreshing it from the device with
  MatSeqAIJCUSPARSECopyFromGPU().

  Output Parameter:
. array - set to the host value array of the underlying Mat_SeqAIJ
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  const Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij      = (const Mat_SeqAIJ *)A->data;
  array[0] = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2369 
/*
  Ends read-only access to the host value array: only the caller's pointer is cleared, the matrix
  itself is left untouched.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2376 
/*
  Hands out the host value array of A without copying anything from the device first.

  Output Parameter:
. array - set to the host value array of the underlying Mat_SeqAIJ
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  array[0] = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2383 
/*
  Ends write access to the host value array: the caller's pointer is cleared and the offload mask
  is set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2391 
/*
  Gives access to the CSR arrays of an unfactored MATSEQAIJCUSPARSE matrix as stored on the device.

  The matrix is validated first and only then brought up to date on the device with
  MatSeqAIJCUSPARSECopyToGPU(). That routine interprets A->spptr as a Mat_SeqAIJCUSPARSE, so it
  must not be reached for a factored matrix or with a NULL spptr.

  Input Parameter:
. A - the matrix; must have factortype MAT_FACTOR_NONE

  Output Parameters (each may be NULL if not wanted):
+ i     - row offsets; requesting them with 64-bit PetscInt raises PETSC_ERR_SUP
. j     - column indices; requesting them with 64-bit PetscInt raises PETSC_ERR_SUP
. a     - values
- mtype - set to PETSC_MEMTYPE_CUDA
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Validate before MatSeqAIJCUSPARSECopyToGPU() dereferences A->spptr */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2422 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device copy of a MATSEQAIJCUSPARSE matrix current

  Nothing is done unless the offload mask of A is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  If the nonzero state recorded at the last copy still equals that of A and the storage format is
  CSR, only the values are re-sent and the cached transpose is invalidated. Otherwise the device
  structure is destroyed and rebuilt from the host CSR arrays (the compressed-row ones when
  a->compressedrow.use is set).

  Raises PETSC_ERR_GPU when A is bound to the CPU or when a required host array is missing.
  The offload mask becomes PETSC_OFFLOAD_BOTH, except when the structure is rebuilt while the
  host has no value array (a->a is NULL), in which case the mask is left unchanged.

  All C++ allocations and thrust container operations are wrapped in PetscCallCXX() so that a
  std::exception thrown by them is turned into a PETSc error instead of escaping this routine.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  PetscBool                     both           = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCallCXX(matrix->values->assign(a->a, a->a + a->nz));
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* The values changed, so a cached transpose no longer matches */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;

      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Throw away everything that depends on the old nonzero structure */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* No host values: only the structure is sent, so the host and device do not both hold valid data */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        PetscCallCXX(matstruct = new Mat_SeqAIJCUSPARSEMultStruct);
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Scalar constants kept in device memory; the handle is switched to device pointer mode below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat;

          PetscCallCXX(mat = new CsrMatrix);
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          PetscCallCXX(mat->row_offsets->assign(ii, ii + m + 1));
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          PetscCallCXX(mat->column_indices->assign(a->j, a->j + nnz));

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) PetscCallCXX(mat->values->assign(a->a, a->a + nnz));

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            PetscCallCUSPARSE(cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Build a temporary CSR copy on the device, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat;

          PetscCallCXX(mat = new CsrMatrix);
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          PetscCallCXX(mat->row_offsets->assign(ii, ii + m + 1));

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          PetscCallCXX(mat->column_indices->assign(a->j, a->j + nnz));

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) PetscCallCXX(mat->values->assign(a->a, a->a + nnz));

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          PetscCallCUSPARSE(cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices->assign(ridx, ridx + m));
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Remember which nonzero structure the device copy corresponds to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2571 
/* Functor on a 2-tuple: adds element 0 to element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    auto &&addend = thrust::get<0>(entry);
    auto &&target = thrust::get<1>(entry);

    target = target + addend;
  }
};
2579 
/* Functor on a 2-tuple: copies element 0 into element 1 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    auto &&source = thrust::get<0>(entry);
    auto &&target = thrust::get<1>(entry);

    target = source;
  }
};
2587 
/* Functor on a 2-tuple: copies element 1 into element 0 (the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    auto &&target = thrust::get<0>(entry);
    auto &&source = thrust::get<1>(entry);

    target = source;
  }
};
2595 
/*
  Per-product context for matrix-matrix products involving a MATSEQAIJCUSPARSE matrix.
  It is released by MatProductCtxDestroy_MatMatCusparse().
*/
struct MatProductCtx_MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar   *Bt;       /* released with cudaFree() */
  Mat            X;        /* released with MatDestroy(); receives the sparse-times-dense result for RARt and PtAP products */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;     /* released with delete */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  /* Record leading dimensions of B and C here to detect changes */
  PetscInt Blda;
  PetscInt Clda;
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* released with cudaFree() */
  void *dBuffer5; /* released with cudaFree() */
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;  /* released with cudaFree() */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2620 
/*
  Releases a MatProductCtx_MatMatCusparse and everything it holds: device buffers, the CSR copy
  of B, the cuSPARSE descriptors, the intermediate matrix X, and finally the context itself.

  Input Parameter:
. data - address of the pointer to the context
*/
static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
{
  MatProductCtx_MatMatCusparse *ctx = *(MatProductCtx_MatMatCusparse **)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(ctx->Bt));
  delete ctx->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* Descriptors first; each is destroyed only if it is set */
  if (ctx->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ctx->matSpBDescr));
  if (ctx->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matBDescr));
  if (ctx->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matCDescr));
  if (ctx->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(ctx->spgemmDesc));
  /* Then the work buffers */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (ctx->dBuffer4) PetscCallCUDA(cudaFree(ctx->dBuffer4));
  if (ctx->dBuffer5) PetscCallCUDA(cudaFree(ctx->dBuffer5));
  #endif
  if (ctx->mmBuffer) PetscCallCUDA(cudaFree(ctx->mmBuffer));
  if (ctx->mmBuffer2) PetscCallCUDA(cudaFree(ctx->mmBuffer2));
#endif
  PetscCall(MatDestroy(&ctx->X));
  PetscCall(PetscFree(ctx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2644 
2645 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2646 
2647 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2648 {
2649   Mat_Product                  *product = C->product;
2650   Mat                           A, B;
2651   PetscInt                      m, n, blda, clda;
2652   PetscBool                     flg, biscuda;
2653   Mat_SeqAIJCUSPARSE           *cusp;
2654   cusparseStatus_t              stat;
2655   cusparseOperation_t           opA;
2656   const PetscScalar            *barray;
2657   PetscScalar                  *carray;
2658   MatProductCtx_MatMatCusparse *mmdata;
2659   Mat_SeqAIJCUSPARSEMultStruct *mat;
2660   CsrMatrix                    *csrmat;
2661 
2662   PetscFunctionBegin;
2663   MatCheckProduct(C, 1);
2664   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2665   mmdata = (MatProductCtx_MatMatCusparse *)product->data;
2666   A      = product->A;
2667   B      = product->B;
2668   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2669   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2670   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2671      Instead of silently accepting the wrong answer, I prefer to raise the error */
2672   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2673   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2674   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2675   switch (product->type) {
2676   case MATPRODUCT_AB:
2677   case MATPRODUCT_PtAP:
2678     mat = cusp->mat;
2679     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2680     m   = A->rmap->n;
2681     n   = B->cmap->n;
2682     break;
2683   case MATPRODUCT_AtB:
2684     if (!A->form_explicit_transpose) {
2685       mat = cusp->mat;
2686       opA = CUSPARSE_OPERATION_TRANSPOSE;
2687     } else {
2688       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2689       mat = cusp->matTranspose;
2690       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2691     }
2692     m = A->cmap->n;
2693     n = B->cmap->n;
2694     break;
2695   case MATPRODUCT_ABt:
2696   case MATPRODUCT_RARt:
2697     mat = cusp->mat;
2698     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2699     m   = A->rmap->n;
2700     n   = B->rmap->n;
2701     break;
2702   default:
2703     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2704   }
2705   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2706   csrmat = (CsrMatrix *)mat->mat;
2707   /* if the user passed a CPU matrix, copy the data to the GPU */
2708   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2709   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2710   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2711 
2712   PetscCall(MatDenseGetLDA(B, &blda));
2713   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2714     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2715     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2716   } else {
2717     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2718     PetscCall(MatDenseGetLDA(C, &clda));
2719   }
2720 
2721   PetscCall(PetscLogGpuTimeBegin());
2722 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2723   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2724   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2725   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2726   #else
2727   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2728   #endif
2729 
2730   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2731   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2732     size_t mmBufferSize;
2733     if (mmdata->initialized && mmdata->Blda != blda) {
2734       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2735       mmdata->matBDescr = NULL;
2736     }
2737     if (!mmdata->matBDescr) {
2738       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2739       mmdata->Blda = blda;
2740     }
2741 
2742     if (mmdata->initialized && mmdata->Clda != clda) {
2743       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2744       mmdata->matCDescr = NULL;
2745     }
2746     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2747       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2748       mmdata->Clda = clda;
2749     }
2750 
2751   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2752     if (matADescr) {
2753       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2754       matADescr = NULL;
2755     }
2756   #endif
2757 
2758     if (!matADescr) {
2759       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2760                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2761       PetscCallCUSPARSE(stat);
2762     }
2763 
2764     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2765 
2766     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2767       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2768       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2769       mmdata->mmBufferSize = mmBufferSize;
2770     }
2771 
2772   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2773     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2774   #endif
2775 
2776     mmdata->initialized = PETSC_TRUE;
2777   } else {
2778     /* to be safe, always update pointers of the mats */
2779     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2780     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2781     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2782   }
2783 
2784   /* do cusparseSpMM, which supports transpose on B */
2785   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2786 #else
2787   PetscInt k;
2788   /* cusparseXcsrmm does not support transpose on B */
2789   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2790     cublasHandle_t cublasv2handle;
2791     cublasStatus_t cerr;
2792 
2793     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2794     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2795     PetscCallCUBLAS(cerr);
2796     blda = B->cmap->n;
2797     k    = B->cmap->n;
2798   } else {
2799     k = B->rmap->n;
2800   }
2801 
2802   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2803   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2804   PetscCallCUSPARSE(stat);
2805 #endif
2806   PetscCall(PetscLogGpuTimeEnd());
2807   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2808   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2809   if (product->type == MATPRODUCT_RARt) {
2810     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2811     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2812   } else if (product->type == MATPRODUCT_PtAP) {
2813     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2814     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2815   } else {
2816     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2817   }
2818   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2819   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2820   PetscFunctionReturn(PETSC_SUCCESS);
2821 }
2822 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a product whose first
  operand A (product->A) is a MATSEQAIJCUSPARSE matrix and whose result C is dense.

  What it does:
    - sets the local/global sizes (m x n) and block sizes of C from A and B, depending on the
      product type (AB, AtB, ABt, PtAP, RARt);
    - switches C to MATSEQDENSECUDA, remembering whether it was MATSEQDENSE beforehand;
    - allocates the MatProductCtx_MatMatCusparse product context, plus an intermediate matrix X
      for PtAP/RARt (and, for CUDA < 11, a device buffer for B^T when B must be transposed);
    - installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine of C.

  Errors (PETSC_ERR_GPU) if product data is already attached to C, if A is not of type
  MATSEQAIJCUSPARSE, if A is not stored in MAT_CUSPARSE_CSR format, or if the product type is
  not one of the five listed above.

  NOTE(review): the type of B is not checked here; presumably B is a sequential dense matrix
  (as the function name suggests) -- confirm against the dispatching code.
*/
2823 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2824 {
2825   Mat_Product                  *product = C->product;
2826   Mat                           A, B;
2827   PetscInt                      m, n;
2828   PetscBool                     cisdense, flg;
2829   MatProductCtx_MatMatCusparse *mmdata;
2830   Mat_SeqAIJCUSPARSE           *cusp;
2831 
2832   PetscFunctionBegin;
2833   MatCheckProduct(C, 1);
       /* the symbolic phase must start without any product context attached to C */
2834   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2835   A = product->A;
2836   B = product->B;
2837   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2838   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2839   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2840   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
       /* determine the size m x n of C and propagate block sizes from the operands */
2841   switch (product->type) {
2842   case MATPRODUCT_AB:
2843     m = A->rmap->n;
2844     n = B->cmap->n;
2845     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2846     break;
2847   case MATPRODUCT_AtB:
2848     m = A->cmap->n;
2849     n = B->cmap->n;
2850     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2851     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2852     break;
2853   case MATPRODUCT_ABt:
2854     m = A->rmap->n;
2855     n = B->rmap->n;
2856     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2857     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2858     break;
2859   case MATPRODUCT_PtAP:
         /* C is square with dimension B->cmap->n; both layouts take the column block size of B */
2860     m = B->cmap->n;
2861     n = B->cmap->n;
2862     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2863     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2864     break;
2865   case MATPRODUCT_RARt:
         /* C is square with dimension B->rmap->n; both layouts take the row block size of B */
2866     m = B->rmap->n;
2867     n = B->rmap->n;
2868     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2869     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2870     break;
2871   default:
2872     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2873   }
       /* sequential matrix: local and global sizes coincide */
2874   PetscCall(MatSetSizes(C, m, n, m, n));
2875   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
       /* the type query must precede MatSetType(), which overwrites the type of C; the result is
          stored in the product context (mmdata->cisdense) below */
2876   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2877   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2878 
2879   /* product data */
2880   PetscCall(PetscNew(&mmdata));
2881   mmdata->cisdense = cisdense;
2882 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2883   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2884   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2885 #endif
2886   /* for these products we need intermediate storage */
       /* X has A->rmap->n rows and as many columns as C: B->rmap->n for RARt, B->cmap->n for PtAP */
2887   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2888     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2889     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2890     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2891       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2892     } else {
2893       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2894     }
2895   }
       /* attach the context to the product and register its destroy callback */
2896   C->product->data    = mmdata;
2897   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2898 
       /* the numeric phase for C is carried out by the sparse-times-dense routine */
2899   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2900   PetscFunctionReturn(PETSC_SUCCESS);
2901 }
2902 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a sparse-sparse product
  C = op(A) * op(B) where A, B and C are all MATSEQAIJCUSPARSE matrices in MAT_CUSPARSE_CSR format.

  It relies on the product context (MatProductCtx_MatMatCusparse) attached to C->product->data:
  the SpGEMM descriptor, work buffers, flop count and (optionally) a B CSR with full row offsets
  are read from it.

  Three paths lead to the common "finalize" epilogue, which marks C as assembled:
    1. mmdata->reusesym is set: the flag is cleared, the GPU structures of C are validated and no
       computation is performed.
    2. C has no nonzeros (c->nz == 0): nothing is computed.
    3. otherwise: MatSeqAIJCUSPARSECopyToGPU() is called on A and B and the cuSPARSE SpGEMM
       routine matching the CUDA version is invoked; C->offloadmask is set to PETSC_OFFLOAD_GPU.

  Supported (effective) product types are AB, AtB and ABt. AtB (resp. ABt) is treated as AB when
  A (resp. B) is flagged symmetric, in which case the symbolic phase must have recorded the same
  substitution.

  Errors: PETSC_ERR_GPU for missing data/structures, wrong matrix types or formats and unsupported
  product types; PETSC_ERR_ARG_WRONG if A or B is bound to the CPU; PETSC_ERR_PLIB if the symmetry
  substitution is inconsistent with the symbolic phase.
*/
2903 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2904 {
2905   Mat_Product                  *product = C->product;
2906   Mat                           A, B;
2907   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2908   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2909   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2910   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2911   PetscBool                     flg;
2912   cusparseStatus_t              stat;
2913   MatProductType                ptype;
2914   MatProductCtx_MatMatCusparse *mmdata;
2915 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2916   cusparseSpMatDescr_t BmatSpDescr;
2917 #endif
2918   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2919 
2920   PetscFunctionBegin;
2921   MatCheckProduct(C, 1);
       /* the product context must already be attached to C */
2922   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2923   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2924   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2925   mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
2926   A      = product->A;
2927   B      = product->B;
2928   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
         /* one-shot flag: cleared here so that subsequent calls recompute the values;
            only sanity-check the GPU structures of C before skipping the computation */
2929     mmdata->reusesym = PETSC_FALSE;
2930     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2931     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2932     Cmat = Ccusp->mat;
2933     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2934     Ccsr = (CsrMatrix *)Cmat->mat;
2935     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2936     goto finalize;
2937   }
       /* C has no nonzeros: skip all checks and cuSPARSE calls */
2938   if (!c->nz) goto finalize;
2939   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2940   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2941   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2942   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2943   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2944   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2945   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2946   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2947   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2948   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2949   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2950   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2951   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2952   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2953 
       /* effective product type: a transposed operand flagged symmetric is used untransposed,
          provided the symbolic phase recorded that it made the same substitution */
2954   ptype = product->type;
2955   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2956     ptype = MATPRODUCT_AB;
2957     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2958   }
2959   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2960     ptype = MATPRODUCT_AB;
2961     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2962   }
       /* opA/opB are always NON_TRANSPOSE, so a transposed operand is taken from the matTranspose
          mult struct instead.
          NOTE(review): matTranspose is not (re)formed in this function; presumably it is built by
          the symbolic phase and kept up to date elsewhere -- confirm */
2963   switch (ptype) {
2964   case MATPRODUCT_AB:
2965     Amat = Acusp->mat;
2966     Bmat = Bcusp->mat;
2967     break;
2968   case MATPRODUCT_AtB:
2969     Amat = Acusp->matTranspose;
2970     Bmat = Bcusp->mat;
2971     break;
2972   case MATPRODUCT_ABt:
2973     Amat = Acusp->mat;
2974     Bmat = Bcusp->matTranspose;
2975     break;
2976   default:
2977     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2978   }
2979   Cmat = Ccusp->mat;
2980   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2981   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2982   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2983   Acsr = (CsrMatrix *)Amat->mat;
2984   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2985   Ccsr = (CsrMatrix *)Cmat->mat;
2986   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2987   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2988   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2989   PetscCall(PetscLogGpuTimeBegin());
       /* three implementations, selected at compile time by the CUDA version:
            CUDA >= 11.4         : cusparseSpGEMMreuse_compute
            11.0 <= CUDA < 11.4  : cusparseSpGEMM_compute followed by cusparseSpGEMM_copy
            CUDA < 11.0          : cusparse_csr_spgemm on the raw CSR arrays */
2990 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2991   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
       /* device pointer mode: the alpha_one/beta_zero scalars passed below are read by cuSPARSE through device pointers */
2992   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2993   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2994   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2995   PetscCallCUSPARSE(stat);
2996   #else
2997   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2998   PetscCallCUSPARSE(stat);
2999   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3000   PetscCallCUSPARSE(stat);
3001   #endif
3002 #else
3003   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3004                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3005   PetscCallCUSPARSE(stat);
3006 #endif
       /* the flop count comes from the product context (mmdata->flops) */
3007   PetscCall(PetscLogGpuFlops(mmdata->flops));
       /* NOTE(review): presumably waits for the GPU work to finish before the timer is stopped -- confirm against the definition of WaitForCUDA */
3008   PetscCallCUDA(WaitForCUDA());
3009   PetscCall(PetscLogGpuTimeEnd());
       /* the values just computed are recorded as residing on the GPU */
3010   C->offloadmask = PETSC_OFFLOAD_GPU;
3011 finalize:
3012   /* shorter version of MatAssemblyEnd_SeqAIJ */
3013   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3014   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3015   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3016   c->reallocs = 0;
       /* adding 0 is a no-op; presumably kept to mirror the bookkeeping of MatAssemblyEnd_SeqAIJ */
3017   C->info.mallocs += 0;
3018   C->info.nz_unneeded = 0;
3019   C->assembled = C->was_assembled = PETSC_TRUE;
3020   C->num_ass++;
3021   PetscFunctionReturn(PETSC_SUCCESS);
3022 }
3023 
3024 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3025 {
3026   Mat_Product                  *product = C->product;
3027   Mat                           A, B;
3028   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3029   Mat_SeqAIJ                   *a, *b, *c;
3030   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3031   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3032   PetscInt                      i, j, m, n, k;
3033   PetscBool                     flg;
3034   cusparseStatus_t              stat;
3035   MatProductType                ptype;
3036   MatProductCtx_MatMatCusparse *mmdata;
3037   PetscLogDouble                flops;
3038   PetscBool                     biscompressed, ciscompressed;
3039 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3040   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3041   cusparseSpMatDescr_t BmatSpDescr;
3042 #else
3043   int cnz;
3044 #endif
3045   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3046 
3047   PetscFunctionBegin;
3048   MatCheckProduct(C, 1);
3049   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3050   A = product->A;
3051   B = product->B;
3052   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3053   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3054   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3055   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3056   a = (Mat_SeqAIJ *)A->data;
3057   b = (Mat_SeqAIJ *)B->data;
3058   /* product data */
3059   PetscCall(PetscNew(&mmdata));
3060   C->product->data    = mmdata;
3061   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3062 
3063   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3064   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3065   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3066   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3067   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3068   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3069 
3070   ptype = product->type;
3071   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3072     ptype                                          = MATPRODUCT_AB;
3073     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3074   }
3075   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3076     ptype                                          = MATPRODUCT_AB;
3077     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3078   }
3079   biscompressed = PETSC_FALSE;
3080   ciscompressed = PETSC_FALSE;
3081   switch (ptype) {
3082   case MATPRODUCT_AB:
3083     m    = A->rmap->n;
3084     n    = B->cmap->n;
3085     k    = A->cmap->n;
3086     Amat = Acusp->mat;
3087     Bmat = Bcusp->mat;
3088     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3089     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3090     break;
3091   case MATPRODUCT_AtB:
3092     m = A->cmap->n;
3093     n = B->cmap->n;
3094     k = A->rmap->n;
3095     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3096     Amat = Acusp->matTranspose;
3097     Bmat = Bcusp->mat;
3098     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3099     break;
3100   case MATPRODUCT_ABt:
3101     m = A->rmap->n;
3102     n = B->rmap->n;
3103     k = A->cmap->n;
3104     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3105     Amat = Acusp->mat;
3106     Bmat = Bcusp->matTranspose;
3107     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3108     break;
3109   default:
3110     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3111   }
3112 
3113   /* create cusparse matrix */
3114   PetscCall(MatSetSizes(C, m, n, m, n));
3115   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3116   c     = (Mat_SeqAIJ *)C->data;
3117   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3118   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3119   Ccsr  = new CsrMatrix;
3120 
3121   c->compressedrow.use = ciscompressed;
3122   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3123     c->compressedrow.nrows = a->compressedrow.nrows;
3124     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3125     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3126     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3127     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3128     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3129   } else {
3130     c->compressedrow.nrows  = 0;
3131     c->compressedrow.i      = NULL;
3132     c->compressedrow.rindex = NULL;
3133     Ccusp->workVector       = NULL;
3134     Cmat->cprowIndices      = NULL;
3135   }
3136   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3137   Ccusp->mat        = Cmat;
3138   Ccusp->mat->mat   = Ccsr;
3139   Ccsr->num_rows    = Ccusp->nrows;
3140   Ccsr->num_cols    = n;
3141   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3142   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3143   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3144   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3145   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3146   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3147   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3148   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3149   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3150   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3151   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3152     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3153     c->nz                = 0;
3154     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3155     Ccsr->values         = new THRUSTARRAY(c->nz);
3156     goto finalizesym;
3157   }
3158 
3159   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3160   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3161   Acsr = (CsrMatrix *)Amat->mat;
3162   if (!biscompressed) {
3163     Bcsr = (CsrMatrix *)Bmat->mat;
3164 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3165     BmatSpDescr = Bmat->matDescr;
3166 #endif
3167   } else { /* we need to use row offsets for the full matrix */
3168     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3169     Bcsr                 = new CsrMatrix;
3170     Bcsr->num_rows       = B->rmap->n;
3171     Bcsr->num_cols       = cBcsr->num_cols;
3172     Bcsr->num_entries    = cBcsr->num_entries;
3173     Bcsr->column_indices = cBcsr->column_indices;
3174     Bcsr->values         = cBcsr->values;
3175     if (!Bcusp->rowoffsets_gpu) {
3176       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3177       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3178       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3179     }
3180     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3181     mmdata->Bcsr      = Bcsr;
3182 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3183     if (Bcsr->num_rows && Bcsr->num_cols) {
3184       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3185       PetscCallCUSPARSE(stat);
3186     }
3187     BmatSpDescr = mmdata->matSpBDescr;
3188 #endif
3189   }
3190   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3191   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3192   /* precompute flops count */
3193   if (ptype == MATPRODUCT_AB) {
3194     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3195       const PetscInt st = a->i[i];
3196       const PetscInt en = a->i[i + 1];
3197       for (j = st; j < en; j++) {
3198         const PetscInt brow = a->j[j];
3199         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3200       }
3201     }
3202   } else if (ptype == MATPRODUCT_AtB) {
3203     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3204       const PetscInt anzi = a->i[i + 1] - a->i[i];
3205       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3206       flops += (2. * anzi) * bnzi;
3207     }
3208   } else { /* TODO */
3209     flops = 0.;
3210   }
3211 
3212   mmdata->flops = flops;
3213   PetscCall(PetscLogGpuTimeBegin());
3214 
3215 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3216   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3217   // cuda-12.2 requires non-null csrRowOffsets
3218   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3219   PetscCallCUSPARSE(stat);
3220   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3221   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3222   {
3223     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3224      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3225   */
3226     void *dBuffer1 = NULL;
3227     void *dBuffer2 = NULL;
3228     void *dBuffer3 = NULL;
3229     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3230     size_t bufferSize1 = 0;
3231     size_t bufferSize2 = 0;
3232     size_t bufferSize3 = 0;
3233     size_t bufferSize4 = 0;
3234     size_t bufferSize5 = 0;
3235 
3236     /* ask bufferSize1 bytes for external memory */
3237     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3238     PetscCallCUSPARSE(stat);
3239     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3240     /* inspect the matrices A and B to understand the memory requirement for the next step */
3241     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3242     PetscCallCUSPARSE(stat);
3243 
3244     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3245     PetscCallCUSPARSE(stat);
3246     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3247     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3248     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3249     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3250     PetscCallCUSPARSE(stat);
3251     PetscCallCUDA(cudaFree(dBuffer1));
3252     PetscCallCUDA(cudaFree(dBuffer2));
3253 
3254     /* get matrix C non-zero entries C_nnz1 */
3255     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3256     c->nz = (PetscInt)C_nnz1;
3257     /* allocate matrix C */
3258     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3259     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3260     Ccsr->values = new THRUSTARRAY(c->nz);
3261     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3262     /* update matC with the new pointers */
3263     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3264     PetscCallCUSPARSE(stat);
3265 
3266     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3267     PetscCallCUSPARSE(stat);
3268     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3269     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3270     PetscCallCUSPARSE(stat);
3271     PetscCallCUDA(cudaFree(dBuffer3));
3272     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3273     PetscCallCUSPARSE(stat);
3274     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3275   }
3276   #else
3277   size_t bufSize2;
3278   /* ask bufferSize bytes for external memory */
3279   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3280   PetscCallCUSPARSE(stat);
3281   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3282   /* inspect the matrices A and B to understand the memory requirement for the next step */
3283   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3284   PetscCallCUSPARSE(stat);
3285   /* ask bufferSize again bytes for external memory */
3286   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3287   PetscCallCUSPARSE(stat);
3288   /* The CUSPARSE documentation is not clear, nor the API
3289      We need both buffers to perform the operations properly!
3290      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3291      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3292      is stored in the descriptor! What a messy API... */
3293   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3294   /* compute the intermediate product of A * B */
3295   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3296   PetscCallCUSPARSE(stat);
3297   /* get matrix C non-zero entries C_nnz1 */
3298   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3299   c->nz = (PetscInt)C_nnz1;
3300   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3301                       mmdata->mmBufferSize / 1024));
3302   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3303   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3304   Ccsr->values = new THRUSTARRAY(c->nz);
3305   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3306   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3307   PetscCallCUSPARSE(stat);
3308   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3309   PetscCallCUSPARSE(stat);
3310   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3311 #else
3312   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3313   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3314                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3315   PetscCallCUSPARSE(stat);
3316   c->nz                = cnz;
3317   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3318   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319   Ccsr->values = new THRUSTARRAY(c->nz);
3320   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3321 
3322   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3323   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3324      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3325      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3326   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3327                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3328   PetscCallCUSPARSE(stat);
3329 #endif
3330   PetscCall(PetscLogGpuFlops(mmdata->flops));
3331   PetscCall(PetscLogGpuTimeEnd());
3332 finalizesym:
3333   c->free_a = PETSC_TRUE;
3334   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3335   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3336   c->free_ij = PETSC_TRUE;
3337   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3338     PetscInt      *d_i = c->i;
3339     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3340     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3341     ii = *Ccsr->row_offsets;
3342     jj = *Ccsr->column_indices;
3343     if (ciscompressed) d_i = c->compressedrow.i;
3344     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3345     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346   } else {
3347     PetscInt *d_i = c->i;
3348     if (ciscompressed) d_i = c->compressedrow.i;
3349     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3350     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3351   }
3352   if (ciscompressed) { /* need to expand host row offsets */
3353     PetscInt r = 0;
3354     c->i[0]    = 0;
3355     for (k = 0; k < c->compressedrow.nrows; k++) {
3356       const PetscInt next = c->compressedrow.rindex[k];
3357       const PetscInt old  = c->compressedrow.i[k];
3358       for (; r < next; r++) c->i[r + 1] = old;
3359     }
3360     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3361   }
3362   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3363   PetscCall(PetscMalloc1(m, &c->ilen));
3364   PetscCall(PetscMalloc1(m, &c->imax));
3365   c->maxnz         = c->nz;
3366   c->nonzerorowcnt = 0;
3367   c->rmax          = 0;
3368   for (k = 0; k < m; k++) {
3369     const PetscInt nn = c->i[k + 1] - c->i[k];
3370     c->ilen[k] = c->imax[k] = nn;
3371     c->nonzerorowcnt += (PetscInt)!!nn;
3372     c->rmax = PetscMax(c->rmax, nn);
3373   }
3374   PetscCall(PetscMalloc1(c->nz, &c->a));
3375   Ccsr->num_entries = c->nz;
3376 
3377   C->nonzerostate++;
3378   PetscCall(PetscLayoutSetUp(C->rmap));
3379   PetscCall(PetscLayoutSetUp(C->cmap));
3380   Ccusp->nonzerostate = C->nonzerostate;
3381   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3382   C->preallocated     = PETSC_TRUE;
3383   C->assembled        = PETSC_FALSE;
3384   C->was_assembled    = PETSC_FALSE;
3385   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3386     mmdata->reusesym = PETSC_TRUE;
3387     C->offloadmask   = PETSC_OFFLOAD_GPU;
3388   }
3389   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3390   PetscFunctionReturn(PETSC_SUCCESS);
3391 }
3392 
3393 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3394 
/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - Picks the symbolic product routine for a matrix product; handles sparse or dense B

  The GPU sparse-sparse path is only considered when neither A nor B is bound to the CPU, B is MATSEQAIJCUSPARSE
  and, for MATPRODUCT_ABC, C is also MATSEQAIJCUSPARSE and not bound to the CPU. In that case the user may still
  request the CPU code through a "*_backend_cpu" option (no such option is queried for MATPRODUCT_ABt).
  Anything that is neither the dense-B case nor the GPU sparse-sparse case falls back to MatProductSetFromOptions_SeqAIJ().
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu   = PETSC_FALSE;
    const char *apiname  = NULL; /* user-level routine name: options title for api_user and manual page in both cases */
    const char *apiopt   = NULL; /* option name queried when the product was created through the user-level API */
    const char *prodname = NULL; /* options title used when the product was created through the MatProduct API */

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname  = "MatMatMult";
      apiopt   = "-matmatmult_backend_cpu";
      prodname = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname  = "MatTransposeMatMult";
      apiopt   = "-mattransposematmult_backend_cpu";
      prodname = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname  = "MatPtAP";
      apiopt   = "-matptap_backend_cpu";
      prodname = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname  = "MatRARt";
      apiopt   = "-matrart_backend_cpu";
      prodname = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname  = "MatMatMatMult";
      apiopt   = "-matmatmatmult_backend_cpu";
      prodname = "MatProduct_ABC";
      break;
    default: /* no CPU-backend option is offered for the remaining product types */
      break;
    }
    if (apiname) {
      const char *title  = product->api_user ? apiname : prodname;
      const char *option = product->api_user ? apiopt : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(option, "Use CPU code", apiname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    /* dropping both flags routes the dispatch below to the AIJ fallback (unless B is dense) */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      /* a CPU-bound A uses the host sparse-dense code; otherwise use the sparse-dense CUDA symbolic routine */
      if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3512 
/* MatMult_SeqAIJCUSPARSE - Computes yy = A xx by calling the shared multiply-add kernel with no vector to add and no transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3519 
/* MatMultAdd_SeqAIJCUSPARSE - Computes zz = A xx + yy by calling the shared multiply-add kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3526 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - Computes yy = A^H xx by calling the shared multiply-add kernel with both the transpose and Hermitian flags set and no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3533 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - Computes zz = A^H xx + yy by calling the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3540 
/* MatMultTranspose_SeqAIJCUSPARSE - Computes yy = A^T xx by calling the shared multiply-add kernel with the transpose flag set and no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3547 
/*
  ScatterAdd - CUDA kernel computing y[idx[i]] += x[i] for 0 <= i < n

  Each thread handles the single entry given by its global 1D thread index; threads whose index is >= n do nothing,
  so a 1D launch with at least n threads covers the whole input.
  The update is a plain (non-atomic) read-modify-write.
  NOTE(review): presumably callers guarantee that idx[] contains no repeated entries - confirm.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Widen before multiplying: the product of two 32-bit unsigned values stored in an int wraps (or becomes
     negative and then passes the i < n test) once n reaches 2^31, which is possible with 64-bit PetscInt */
  const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (n > 0 && i < (size_t)n) y[idx[i]] += x[i];
}
3553 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - Computes z = op(A) x + y on the GPU (z = op(A) x when yy is NULL)

  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

  Input Parameters:
+ A     - the matrix
. xx    - the vector x
. yy    - the vector y to add, or NULL
. trans - PETSC_TRUE to apply the transpose of A
- herm  - PETSC_TRUE, only valid together with trans, to apply the conjugate transpose of A

  Output Parameter:
. zz - the result z; may be the same vector as yy

  Notes:
  A matrix without nonzeros is handled without touching cuSPARSE (zz = yy, or zz = 0).

  For the plain transpose with A->form_explicit_transpose set, an explicitly stored transpose is multiplied with
  CUSPARSE_OPERATION_NON_TRANSPOSE; otherwise the stored matrix is used with the matching cuSPARSE operation.

  When the matrix structure stores compressed rows (cprowIndices is set) a work vector holds the short side of the
  product and is gathered from / scattered into the full-length vector.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx = 0, ny = 0; /* only assigned (and only used) for MAT_CUSPARSE_CSR; initialized so they are never indeterminate */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
      /* matstruct is dereferenced right below, so guard the transpose paths just like the non-transpose one */
      PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
      PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'matTranspose' (need to fix)");
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA is used as an array index below, so validate its range before the first indexed access */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status; catch launch failures here */
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop count: one multiply and one add per stored nonzero; without yy, one add per nonzero row is not counted */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3727 
/* MatMultTransposeAdd_SeqAIJCUSPARSE - Computes zz = A^T xx + yy by calling the shared multiply-add kernel with the transpose flag set */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3734 
3735 PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
3736 
/*
  GetDiagonal_CSR - CUDA kernel that extracts the diagonal of a matrix stored in CSR form

  The thread with global 1D index x handles row x: it scans the column indices of that row in storage order and
  writes the value of the first entry whose column equals x into diag[x], or 0.0 if the row stores no such entry.
  Threads with x >= len do nothing.

  row  - row offsets, row[x] .. row[x + 1] delimit the entries of row x
  col  - column index of each stored entry
  val  - value of each stored entry
  len  - number of rows to process
  diag - output, one value per processed row
*/
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t x = blockIdx.x * blockDim.x + threadIdx.x;

  if (x >= len) return; /* surplus threads of the last block */

  const PetscInt first = row[x], last = row[x + 1];
  PetscScalar    entry = 0.0;

  for (PetscInt k = first; k < last; k++) {
    if (col[k] == x) {
      entry = val[k];
      break;
    }
  }
  diag[x] = entry;
}
3754 
/*
  MatGetDiagonal_SeqAIJCUSPARSE - Stores the diagonal of A in the vector diag

  When the offload mask says the GPU copy is valid (PETSC_OFFLOAD_GPU or PETSC_OFFLOAD_BOTH) the diagonal is
  extracted on the device with the GetDiagonal_CSR kernel, which requires the MAT_CUSPARSE_CSR format;
  otherwise the host routine MatGetDiagonal_SeqAIJ() is used.
*/
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_BOTH && A->offloadmask != PETSC_OFFLOAD_GPU) {
    /* no valid device copy: use the host implementation */
    PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscInt   nrows = A->rmap->n;
  CsrMatrix *csr   = (CsrMatrix *)mult->mat;

  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
  if (nrows > 0) { /* nothing to launch for a matrix without local rows */
    PetscScalar *d_diag;

    PetscCall(VecCUDAGetArrayWrite(diag, &d_diag));
    /* one thread per row, 256 threads per block */
    GetDiagonal_CSR<<<(int)((nrows + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), nrows, d_diag);
    PetscCallCUDA(cudaPeekAtLastError());
    PetscCall(VecCUDARestoreArrayWrite(diag, &d_diag));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3776 
/* MatAssemblyEnd_SeqAIJCUSPARSE - Finishes assembly of A; this routine only forwards to the host MatAssemblyEnd_SeqAIJ() */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  /* no device work is issued here */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3783 
/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

  When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
          `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* create the matrix object, then fix its sizes: local and global sizes are both m x n */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  /* select the CUSPARSE type before preallocating the AIJ storage */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3833 
3834 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3835 {
3836   PetscFunctionBegin;
3837   if (A->factortype == MAT_FACTOR_NONE) {
3838     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3839   } else {
3840     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3841   }
3842   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3843   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3844   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3845   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3846   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3847   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3848   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3849   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3850   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3851   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3852   PetscCall(MatDestroy_SeqAIJ(A));
3853   PetscFunctionReturn(PETSC_SUCCESS);
3854 }
3855 
3856 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3857 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicate callback: duplicates A through MatDuplicate_SeqAIJ()
  and then converts the result in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3865 
/*
  MatAXPY_SeqAIJCUSPARSE - AXPY callback (Y = a*X + Y) for MATSEQAIJCUSPARSE.

  Input Parameters:
+ Y   - matrix that is updated
. a   - scalar multiplying X
. X   - matrix that is added
- str - nonzero pattern relation between X and Y

  Three paths are taken depending on the (possibly adjusted) value of str:
  SUBSET_NONZERO_PATTERN uses cusparse_csr_spgeam() writing into Y's own CSR arrays,
  SAME_NONZERO_PATTERN uses cublasXaxpy() on the value arrays,
  anything else falls back to MatAXPY_SeqAIJ().
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  /* X and Y do not share the same axpy implementation: use the SeqAIJ routine, flagging Y's transpose as out of date first */
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nonzero count: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient applied to Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* &a and &b are host addresses, so switch the handle to host pointer mode for the spgeam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* Y's descriptor, values, row offsets and column indices are passed both as second operand and as output */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* put the handle back into device pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: the update reduces to an axpy over the x->nz stored values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else {
    /* any other pattern relation: use the SeqAIJ routine, flagging Y's transpose as out of date first */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3949 
3950 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3951 {
3952   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3953   PetscScalar   *ay;
3954   cublasHandle_t cublasv2handle;
3955   PetscBLASInt   one = 1, bnz = 1;
3956 
3957   PetscFunctionBegin;
3958   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3959   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3960   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3961   PetscCall(PetscLogGpuTimeBegin());
3962   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3963   PetscCall(PetscLogGpuFlops(bnz));
3964   PetscCall(PetscLogGpuTimeEnd());
3965   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3966   PetscFunctionReturn(PETSC_SUCCESS);
3967 }
3968 
3969 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3970 {
3971   PetscBool   gpu = PETSC_FALSE;
3972   Mat_SeqAIJ *a   = (Mat_SeqAIJ *)A->data;
3973 
3974   PetscFunctionBegin;
3975   if (A->factortype == MAT_FACTOR_NONE) {
3976     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3977     if (spptr->mat) {
3978       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3979       if (matrix->values) {
3980         gpu = PETSC_TRUE;
3981         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3982       }
3983     }
3984     if (spptr->matTranspose) {
3985       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3986       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3987     }
3988   }
3989   if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
3990   else {
3991     PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3992     A->offloadmask = PETSC_OFFLOAD_CPU;
3993   }
3994   PetscFunctionReturn(PETSC_SUCCESS);
3995 }
3996 
/*
  MatGetCurrentMemType_SeqAIJCUSPARSE - Reports PETSC_MEMTYPE_CUDA in *m; the matrix argument is not inspected.
*/
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4003 
4004 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
4005 {
4006   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4007 
4008   PetscFunctionBegin;
4009   if (A->factortype != MAT_FACTOR_NONE) {
4010     A->boundtocpu = flg;
4011     PetscFunctionReturn(PETSC_SUCCESS);
4012   }
4013   if (flg) {
4014     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
4015 
4016     A->ops->scale                     = MatScale_SeqAIJ;
4017     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
4018     A->ops->axpy                      = MatAXPY_SeqAIJ;
4019     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
4020     A->ops->mult                      = MatMult_SeqAIJ;
4021     A->ops->multadd                   = MatMultAdd_SeqAIJ;
4022     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
4023     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
4024     A->ops->multhermitiantranspose    = NULL;
4025     A->ops->multhermitiantransposeadd = NULL;
4026     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
4027     A->ops->getcurrentmemtype         = NULL;
4028     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4029     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4030     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4031     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4032     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4033     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4034     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4035   } else {
4036     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4037     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
4038     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4039     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4040     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4041     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4042     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4043     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4044     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4045     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4046     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
4047     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
4048     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4049     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4050     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4051     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4052     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4053     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4054     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4055 
4056     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4057     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4058     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4059     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4060     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4061     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4062   }
4063   A->boundtocpu = flg;
4064   if (flg && a->inode.size_csr) {
4065     a->inode.use = PETSC_TRUE;
4066   } else {
4067     a->inode.use = PETSC_FALSE;
4068   }
4069   PetscFunctionReturn(PETSC_SUCCESS);
4070 }
4071 
4072 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4073 {
4074   Mat B;
4075 
4076   PetscFunctionBegin;
4077   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4078   if (reuse == MAT_INITIAL_MATRIX) {
4079     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4080   } else if (reuse == MAT_REUSE_MATRIX) {
4081     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4082   }
4083   B = *newmat;
4084 
4085   PetscCall(PetscFree(B->defaultvectype));
4086   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4087 
4088   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4089     if (B->factortype == MAT_FACTOR_NONE) {
4090       Mat_SeqAIJCUSPARSE *spptr;
4091       PetscCall(PetscNew(&spptr));
4092       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4093       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4094       spptr->format = MAT_CUSPARSE_CSR;
4095 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4096   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4097       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4098   #else
4099       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4100   #endif
4101       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4102       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4103 #endif
4104       B->spptr = spptr;
4105     } else {
4106       Mat_SeqAIJCUSPARSETriFactors *spptr;
4107 
4108       PetscCall(PetscNew(&spptr));
4109       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4110       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4111       B->spptr = spptr;
4112     }
4113     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4114   }
4115   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
4116   B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
4117   B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
4118   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
4119   B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
4120   B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
4121   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
4122 
4123   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4124   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4125   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4126 #if defined(PETSC_HAVE_HYPRE)
4127   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4128 #endif
4129   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4130   PetscFunctionReturn(PETSC_SUCCESS);
4131 }
4132 
/*
  MatCreate_SeqAIJCUSPARSE - Type constructor: builds B as a SeqAIJ matrix and then converts it
  in place to MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4140 
4141 /*MC
4142    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4143 
4144    Options Database Keys:
4145 +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4146 .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4147                                            Other options include ell (ellpack) or hyb (hybrid).
4148 .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4149 -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4150 
4151   Level: beginner
4152 
4153   Notes:
4154   These matrices can be in either CSR, ELL, or HYB format.
4155 
4156   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4157 
4158   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4159   if some integer values passed in do not fit in `int`.
4160 
4161 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4162 M*/
4163 
4164 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4165 {
4166   PetscFunctionBegin;
4167   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4168   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4169   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4170   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4171   PetscFunctionReturn(PETSC_SUCCESS);
4172 }
4173 
4174 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4175 {
4176   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4177 
4178   PetscFunctionBegin;
4179   if (cusp) {
4180     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4181     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4182     delete cusp->workVector;
4183     delete cusp->rowoffsets_gpu;
4184     delete cusp->csr2csc_i;
4185     delete cusp->coords;
4186     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4187     PetscCall(PetscFree(mat->spptr));
4188   }
4189   PetscFunctionReturn(PETSC_SUCCESS);
4190 }
4191 
4192 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4193 {
4194   PetscFunctionBegin;
4195   if (*mat) {
4196     delete (*mat)->values;
4197     delete (*mat)->column_indices;
4198     delete (*mat)->row_offsets;
4199     delete *mat;
4200     *mat = 0;
4201   }
4202   PetscFunctionReturn(PETSC_SUCCESS);
4203 }
4204 
4205 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4206 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4207 {
4208   PetscFunctionBegin;
4209   if (*trifactor) {
4210     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4211     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4212     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4213     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4214     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4215   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4216     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4217   #endif
4218     PetscCall(PetscFree(*trifactor));
4219   }
4220   PetscFunctionReturn(PETSC_SUCCESS);
4221 }
4222 #endif
4223 
4224 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4225 {
4226   CsrMatrix *mat;
4227 
4228   PetscFunctionBegin;
4229   if (*matstruct) {
4230     if ((*matstruct)->mat) {
4231       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4232 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4233         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4234 #else
4235         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4236         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4237 #endif
4238       } else {
4239         mat = (CsrMatrix *)(*matstruct)->mat;
4240         PetscCall(CsrMatrix_Destroy(&mat));
4241       }
4242     }
4243     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4244     delete (*matstruct)->cprowIndices;
4245     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4246     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4247     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4248 
4249 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4250     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4251     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4252 
4253     for (int i = 0; i < 3; i++) {
4254       if (mdata->cuSpMV[i].initialized) {
4255         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4256         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4257         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4258   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4259         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4260         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4261   #endif
4262       }
4263     }
4264 #endif
4265     delete *matstruct;
4266     *matstruct = NULL;
4267   }
4268   PetscFunctionReturn(PETSC_SUCCESS);
4269 }
4270 
4271 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4272 {
4273   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4274 
4275   PetscFunctionBegin;
4276   if (fs) {
4277 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4278     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4279     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4280     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4281     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4282     delete fs->workVector;
4283     fs->workVector = NULL;
4284 #endif
4285     delete fs->rpermIndices;
4286     delete fs->cpermIndices;
4287     fs->rpermIndices  = NULL;
4288     fs->cpermIndices  = NULL;
4289     fs->init_dev_prop = PETSC_FALSE;
4290 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4291     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4292     PetscCallCUDA(cudaFree(fs->csrColIdx));
4293     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4294     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4295     PetscCallCUDA(cudaFree(fs->csrVal));
4296     PetscCallCUDA(cudaFree(fs->diag));
4297     PetscCallCUDA(cudaFree(fs->X));
4298     PetscCallCUDA(cudaFree(fs->Y));
4299     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4300     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4301     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4302     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4303     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4304     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4305     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4306     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4307     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4308     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4309     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4310     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4311     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4312     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4313     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4314     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4315     PetscCall(PetscFree(fs->csrRowPtr_h));
4316     PetscCall(PetscFree(fs->csrVal_h));
4317     PetscCall(PetscFree(fs->diag_h));
4318     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4319     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4320 #endif
4321   }
4322   PetscFunctionReturn(PETSC_SUCCESS);
4323 }
4324 
4325 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4326 {
4327   PetscFunctionBegin;
4328   if (*trifactors) {
4329     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4330     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4331     PetscCall(PetscFree(*trifactors));
4332   }
4333   PetscFunctionReturn(PETSC_SUCCESS);
4334 }
4335 
4336 struct IJCompare {
4337   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4338   {
4339     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4340     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4341     return false;
4342   }
4343 };
4344 
4345 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4346 {
4347   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4348 
4349   PetscFunctionBegin;
4350   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4351   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4352   if (destroy) {
4353     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4354     delete cusp->csr2csc_i;
4355     cusp->csr2csc_i = NULL;
4356   }
4357   A->transupdated = PETSC_FALSE;
4358   PetscFunctionReturn(PETSC_SUCCESS);
4359 }
4360 
/*
  MatCOOStructDestroy_SeqAIJCUSPARSE - Destructor passed to PetscObjectContainerCompose() for the device COO struct.

  ctx is dereferenced once to obtain the MatCOOStruct_SeqAIJ pointer. Its perm and jmap arrays
  are released with cudaFree(), the struct itself with PetscFree().
*/
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(PetscCtxRt ctx)
{
  MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)ctx;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4371 
4372 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4373 {
4374   PetscBool            dev_ij = PETSC_FALSE;
4375   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
4376   PetscInt            *i, *j;
4377   PetscContainer       container_h;
4378   MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4379 
4380   PetscFunctionBegin;
4381   PetscCall(PetscGetMemType(coo_i, &mtype));
4382   if (PetscMemTypeDevice(mtype)) {
4383     dev_ij = PETSC_TRUE;
4384     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4385     PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4386     PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4387   } else {
4388     i = coo_i;
4389     j = coo_j;
4390   }
4391 
4392   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4393   if (dev_ij) PetscCall(PetscFree2(i, j));
4394   mat->offloadmask = PETSC_OFFLOAD_CPU;
4395   // Create the GPU memory
4396   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4397 
4398   // Copy the COO struct to device
4399   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4400   PetscCall(PetscContainerGetPointer(container_h, &coo_h));
4401   PetscCall(PetscMalloc1(1, &coo_d));
4402   *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4403   PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4404   PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4405   PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4406   PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4407 
4408   // Put the COO struct in a container and then attach that to the matrix
4409   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4410   PetscFunctionReturn(PETSC_SUCCESS);
4411 }
4412 
4413 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4414 {
4415   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4416   const PetscCount grid_size = gridDim.x * blockDim.x;
4417   for (; i < nnz; i += grid_size) {
4418     PetscScalar sum = 0.0;
4419     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4420     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4421   }
4422 }
4423 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of the matrix on the device from an array of COO values

  Input Parameters:
+ A     - the matrix; the COO struct attached under "__PETSc_MatCOOStruct_Device" must be present
. v     - the COO values, in host or device memory (detected with PetscGetMemType())
- imode - `INSERT_VALUES` to overwrite the stored values, any other mode adds to them

  Note:
  If v[] is in host memory it is staged through a temporary device buffer of coo->n entries that is freed before returning.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;
  const PetscCount     nthreads  = 256;
  /* largest block count for which blocks * nthreads still fits in 32-bit unsigned arithmetic */
  const PetscCount     maxblocks = 16777215;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, &coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* do the ceiling division in PetscCount and only then narrow; the kernel strides over the grid so clamping the block count is safe */
    PetscCount nblocks = (Annz + nthreads - 1) / nthreads;

    if (nblocks > maxblocks) nblocks = maxblocks;
    MatAddCOOValues<<<(int)nblocks, (int)nthreads>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4463 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers (pass `NULL` if not needed), these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices (pass `NULL` if not needed), these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  If both `i` and `j` are `NULL` the routine returns immediately without touching the matrix

  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS); /* nothing was requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only look inside A once its header and type have been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  a    = (Mat_SeqAIJ *)A->data;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* built once from the host row pointers and cached on the matrix */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4511 
/*@C
  MatSeqAIJCUSPARSERestoreIJ - gives back the device `i` and `j` index arrays handed out by `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE`, the same flag that was passed to `MatSeqAIJCUSPARSEGetIJ()`; it is not used here
. i          - the CSR row pointers, may be `NULL`
- j          - the CSR column indices, may be `NULL`

  Level: developer

  Note:
  Each non-`NULL` argument is reset so that it no longer points at the device data

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* kept only for symmetry with the Get routine */
  if (j) *j = nullptr;
  if (i) *i = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4537 
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

  Only supported for the CSR storage format; an error is raised for the ELL and HYB formats

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only look inside A once its header and type have been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4573 
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only device array obtained with `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data; it is reset on return

  Level: developer

  Note:
  The matrix itself is not modified by this call

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = nullptr; /* the caller must not use the pointer any longer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4596 
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

  The matrix is marked as having its valid data on the GPU and `MatSeqAIJCUSPARSEInvalidateTranspose()` is called on it

  Only supported for the CSR storage format; an error is raised for the ELL and HYB formats

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only look inside A once its header and type have been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller may modify the device values */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - gives back the read-write device array obtained with `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data; it is reset on return

  Level: developer

  Note:
  The object state of `A` is increased since the values may have been changed through the array

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* record that the matrix may have been modified */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4657 
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger any host to device copies; the device matrix structure must already exist, otherwise an error is raised

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

  Only supported for the CSR storage format; an error is raised for the ELL and HYB formats

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only look inside A once its header and type have been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy becomes the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4696 
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only device array obtained with `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data; it is reset on return

  Level: developer

  Note:
  The object state of `A` is increased since the values have been written through the array

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* record that the matrix has been modified */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4720 
/* Strict ordering of (row, column, value, flag) tuples by row first and then by column; the value and the flag do not take part in the comparison */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int row1 = thrust::get<0>(t1), row2 = thrust::get<0>(t2);

    /* different rows decide immediately, equal rows are ordered by column */
    if (row1 != row2) return row1 < row2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4729 
/* Unary functor that adds a fixed integer offset, chosen at construction, to its argument */
struct Shift {
  int _shift; /* the offset that gets added */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c)
  {
    const int shifted = _shift + c;
    return shifted;
  }
};
4736 
/*
  merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation

  Input Parameters:
+ A     - first `MATSEQAIJCUSPARSE` matrix
. B     - second `MATSEQAIJCUSPARSE` matrix, with the same number of rows as A
- reuse - `MAT_INITIAL_MATRIX` creates C; any other accepted value updates the values of an existing C
          (`MAT_INPLACE_MATRIX` is rejected)

  Output Parameter:
. C - the merged matrix with A->rmap->n rows and A->cmap->n + B->cmap->n columns

  Notes:
  Only the CSR storage format is supported for A, B (and C when reusing).

  With `MAT_INITIAL_MATRIX` the positions in C of the entries of A and of B are stored in Ccusp->coords
  (first those of A, then those of B); the reuse branch relies on them to scatter the new values.

  On return C is assembled with its up-to-date data on the GPU; the host array c->a is allocated but not filled here.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* only look inside A and B once their headers and types have been validated */
  a     = (Mat_SeqAIJ *)A->data;
  b     = (Mat_SeqAIJ *)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants of the mult struct */
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* temporary COO row indices of A, B and C */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into one row index per nonzero */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* flag carried through the merge: 1 for entries coming from A, 0 for entries coming from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* column indices of B are shifted by the number of columns of A */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      /* p1 addresses the first Annz entries of coords, p2 the remaining ones */
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
#if CCCL_VERSION >= 3001000
      cuda::std::advance(p2, Annz);
#else
      thrust::advance(p2, Annz);
#endif
      /* merge the (row, column, value, flag) tuples of A and B, ordered by row and then column, into C */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift of the column indices of B */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
  #else
      auto pred = cuda::std::identity();
  #endif
      /* positions in C whose flag is nonzero (from A) go to p1, those whose flag is zero (from B) go to p2 */
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices into the CSR row offsets of C */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the transpose of C stacks the rows of the transpose of A on top of those of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one entry: the last offset of A's transpose is overwritten by the first (shifted) offset of B's transpose */
#if CCCL_VERSION >= 3001000
          cuda::std::advance(rT, -1);
#else
          thrust::advance(rT, -1);
#endif
        }
        if (BT) {
          /* row offsets of B's transpose are shifted by the number of nonzeros of A */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* build the host side of C: the i and j arrays are copied back from the device, the values array is only allocated */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths, number of nonempty rows and maximum row length from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords is split at pmid: the part before it refers to the entries of A, the part after it to those of B */
      auto pmid = Ccusp->coords->begin();
#if CCCL_VERSION >= 3001000
      cuda::std::advance(pmid, Acsr->num_entries);
#else
      thrust::advance(pmid, Acsr->num_entries);
#endif
      PetscCall(PetscLogGpuTimeBegin());
      /* pair each value of A (then of B) with the entry of C selected by coords; VecCUDAEquals is defined elsewhere and presumably assigns the first element of each pair to the second */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* the values of C's transpose are those of A's transpose followed by those of B's transpose */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
5032 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies entries of the device array of matrix values into v[]

  Input Parameters:
+ A   - the matrix
. n   - number of values to copy
- idx - positions, within the array of matrix values, of the entries to copy; when `NULL` (or n is zero)
        the first n values are copied contiguously

  Output Parameter:
. v - the destination, either host or device memory (detected with isCudaMem())

  Note:
  When idx[] is given and v[] is in host memory the selected values are gathered into a temporary device
  array and then copied to the host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* staging array, only sized when v[] is on the host; being a scoped object it is released on every exit path */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k] for k in [0, n); VecCUDAEquals is defined elsewhere and presumably performs the assignment */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5068