xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision be39004295a98ced168607de69334a8b204d0588)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17 // thrust::for_each(thrust::cuda::par.on()) requires C++14
18 #endif
19 #include <thrust/iterator/constant_iterator.h>
20 #include <thrust/remove.h>
21 #include <thrust/sort.h>
22 #include <thrust/unique.h>
23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24   #include <cuda/std/functional>
25 #endif
26 
27 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
/* Name table for the GPU storage formats, passed to PetscOptionsEnum() in MatSetFromOptions_SeqAIJCUSPARSE(): the value
   names first, then the enum type name, the common prefix of the enum values (cf. `MAT_CUSPARSE_CSR`) and a terminating 0 */
28 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
29 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
30 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32 
33   typedef enum {
34       CUSPARSE_MV_ALG_DEFAULT = 0,
35       CUSPARSE_COOMV_ALG      = 1,
36       CUSPARSE_CSRMV_ALG1     = 2,
37       CUSPARSE_CSRMV_ALG2     = 3
38   } cusparseSpMVAlg_t;
39 
40   typedef enum {
41       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
42       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
43       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
44       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
45       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
46       CUSPARSE_SPMM_ALG_DEFAULT = 0,
47       CUSPARSE_SPMM_COO_ALG1    = 1,
48       CUSPARSE_SPMM_COO_ALG2    = 2,
49       CUSPARSE_SPMM_COO_ALG3    = 3,
50       CUSPARSE_SPMM_COO_ALG4    = 5,
51       CUSPARSE_SPMM_CSR_ALG1    = 4,
52       CUSPARSE_SPMM_CSR_ALG2    = 6,
53   } cusparseSpMMAlg_t;
54 
55   typedef enum {
56       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
57       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
58   } cusparseCsr2CscAlg_t;
59   */
/* Name tables handed to PetscOptionsEnum() for -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg and -mat_cusparse_csr2csc_alg.
   The position of a name in its table is used as the enum value, so the order must follow the integer values of the
   corresponding cuSPARSE enum; MatSetFromOptions_SeqAIJCUSPARSE() spot-checks one value per enum. */
60 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
61 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
62 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
63 #endif
64 
/* Forward declarations of file-local (static) routines, grouped by purpose. */
/* Symbolic/numeric factorization entry points */
65 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
66 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
67 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
68 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
/* Triangular-solve routines declared only when building against CUDA older than 11.4 */
69 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
72 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
73 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
74 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
75 #endif
/* Options handling and the matrix operations (AXPY, scale, mat-vec products) */
76 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
77 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
78 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
79 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
80 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
84 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
85 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
86
/* Destructors for the GPU-side data structures */
87 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
88 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
91
/* Host/device data movement and cached-transpose invalidation */
92 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
94
/* Sub-array copy and COO assembly */
95 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Records the requested GPU storage format in the matrix's Mat_SeqAIJCUSPARSE data

  Input Parameters:
+ A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
. op     - must be MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL; both are stored in the same `format` field
- format - the storage format to record

  Any other value of op raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *const gpu = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* reject every operation other than the two supported ones before touching the struct */
  PetscCheck(op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL, PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  gpu->format = format;
  PetscFunctionReturn(PETSC_SUCCESS);
}
116 
117 /*@
118   MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
119   operation. Only the `MatMult()` operation can use different GPU storage formats
120 
121   Not Collective
122 
123   Input Parameters:
124 + A      - Matrix of type `MATSEQAIJCUSPARSE`
125 . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
126         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
127 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
128 
129   Level: intermediate
130 
131 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
132 @*/
133 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
134 {
135   PetscFunctionBegin;
136   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
      // Forward to the implementation attached to A under the name "MatCUSPARSESetFormat_C".
      // NOTE(review): presumably a silent no-op when A's type does not provide that method -- confirm PetscTryMethod() semantics
137   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
138   PetscFunctionReturn(PETSC_SUCCESS);
139 }
140 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Stores the CPU-solve flag in the matrix's Mat_SeqAIJCUSPARSE data

  Input Parameters:
+ A       - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
- use_cpu - value written to the `use_cpu_solve` field
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
149 
150 /*@
151   MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
152 
153   Input Parameters:
154 + A       - Matrix of type `MATSEQAIJCUSPARSE`
155 - use_cpu - set flag for using the built-in CPU `MatSolve()`
156 
157   Level: intermediate
158 
159   Note:
160   The cuSparse LU solver currently computes the factors with the built-in CPU method
161   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
162   This routine specifies whether the solve is done on the CPU or the GPU (GPU is the default).
163 
164 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
165 @*/
166 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
167 {
168   PetscFunctionBegin;
169   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
      // Forward to the implementation attached to A under the name "MatCUSPARSESetUseCPUSolve_C".
      // NOTE(review): presumably a silent no-op when A's type does not provide that method -- confirm PetscTryMethod() semantics
170   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
171   PetscFunctionReturn(PETSC_SUCCESS);
172 }
173 
/*
  MatSetOption_SeqAIJCUSPARSE - Option setter that handles MAT_FORM_EXPLICIT_TRANSPOSE itself and hands every
  other option to MatSetOption_SeqAIJ()

  Input Parameters:
+ A   - the matrix
. op  - the option being set
- flg - the new value of the option
*/
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op != MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* not an option specific to this type: let the SeqAIJ setter deal with it */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* when the option is being switched off, destroy the transpose matrix if present so that
     re-enabling the option later cannot cause logic errors */
  if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  A->form_explicit_transpose = flg;
  PetscFunctionReturn(PETSC_SUCCESS);
}
189 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Reads the -mat_cusparse_* options for an unfactored matrix

  Nothing is queried when A->factortype != MAT_FACTOR_NONE. The two storage-format options and the CPU-solve option are
  routed through the public setters MatCUSPARSESetFormat() and MatCUSPARSESetUseCPUSolve(). When built with CUDA >= 11.0,
  the SpMV, SpMM and Csr2Csc algorithm options are written directly into the Mat_SeqAIJCUSPARSE data, and each one that
  was given is followed by a check that one cuSPARSE enum value still equals its position in the PETSc name table.
*/
190 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
191 {
192   MatCUSPARSEStorageFormat format;
193   PetscBool                flg;
194   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
195 
196   PetscFunctionBegin;
197   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
198   if (A->factortype == MAT_FACTOR_NONE) {
        /* format used for SpMV only; `format` is meaningful only when flg is set */
199     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
201 
        /* format for all operations; queried second, so it takes effect last when both options are given */
202     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
203     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
        /* the value is read straight into the struct field and, when the option was given, also passed through the setter */
204     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
205     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
206 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
207     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
208     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
209   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
210     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
211   #else
212     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
213   #endif
        /* same pattern for the SpMM algorithm: read into the struct, then check one enum value against its table position */
214     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
215     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
216 
        /* and for the CSR-to-CSC conversion algorithm */
217     PetscCall(
218       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
219     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
220 #endif
221   }
222   PetscOptionsHeadEnd();
223   PetscFunctionReturn(PETSC_SUCCESS);
224 }
225 
226 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_LU - Copies the LU factors held in the host Mat_SeqAIJ data to the GPU and
  prepares them for cusparseSpSV

  Does nothing unless A->offloadmask == PETSC_OFFLOAD_CPU. On the first call (fs->csrRowPtr is NULL) it builds the row
  pointers and column indices of one regular CSR matrix M that holds, per row, the entries of L, the diagonal, and the
  entries of U to the right of the diagonal; copies them to the device; and creates the L and U sparse-matrix
  descriptors (both over the same device arrays, differing only in fill mode and diagonal type), the dense work
  vectors X and Y, and the SpSV descriptors and buffers. On every call it repacks the values into fs->csrVal_h, copies
  them to fs->csrVal, and then either updates the existing SpSV descriptors (CUDA >= 12.1.1 with a prior analysis) or
  runs cusparseSpSV_analysis() for L and U.
*/
227 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
228 {
229   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
230   PetscInt                      m  = A->rmap->n;
231   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
232   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
233   const MatScalar              *Aa = a->a;
234   PetscInt                     *Mi, *Mj, Mnz;
235   PetscScalar                  *Ma;
236 
237   PetscFunctionBegin;
238   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
239     if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
240       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
241       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
242       PetscCall(PetscMalloc1(m + 1, &Mi));
243       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
244       PetscCall(PetscMalloc1(Mnz, &Ma));
245       Mi[0] = 0;
246       for (PetscInt i = 0; i < m; i++) {
247         PetscInt llen = Ai[i + 1] - Ai[i];
248         PetscInt ulen = Adiag[i] - Adiag[i + 1];
249         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
250         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
251         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
252         Mi[i + 1] = Mi[i] + llen + ulen;
253       }
254       // Copy M (L,U) from host to device
          // Only the structure (row pointers, column indices) is copied here; the values are copied further below on every call
255       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
256       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
257       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
258       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
259       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
260 
261       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
262       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
263       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
264       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
265       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
266       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
267       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
268       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
269 
          // L: lower fill mode, unit diagonal, over the full M arrays
270       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
271       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
272       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
273 
          // U: upper fill mode, non-unit diagonal, over the same device arrays as L
274       fillMode = CUSPARSE_FILL_MODE_UPPER;
275       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
276       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
277       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
278       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
279 
280       // Allocate work vectors in SpSv
281       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
282       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
283 
284       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
285       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
286 
287       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
288       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
289       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
290       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
291       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
292       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
293       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
294 
295       // Record for reuse
          // Mi and Ma stay alive in fs for later value updates; the host column indices are no longer needed
296       fs->csrRowPtr_h = Mi;
297       fs->csrVal_h    = Ma;
298       PetscCall(PetscFree(Mj));
299     }
300     // Copy the value
        // Executed on every call: repack the values of L, the diagonal and U into Ma using the saved host row pointers
301     Mi  = fs->csrRowPtr_h;
302     Ma  = fs->csrVal_h;
303     Mnz = Mi[m];
304     for (PetscInt i = 0; i < m; i++) {
305       PetscInt llen = Ai[i + 1] - Ai[i];
306       PetscInt ulen = Adiag[i] - Adiag[i + 1];
307       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
          // NOTE(review): the reciprocal is taken here, so Aa[Adiag[i]] presumably stores the inverted diagonal of U -- confirm against the CPU factorization
308       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
309       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
310     }
311     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
312 
        // With CUDA >= 12.1.1 a previous analysis is refreshed in place; otherwise (or on the first pass) the braces below run the full analysis
313   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
314     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
315       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
316       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
317       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
318     } else
319   #endif
320     {
321       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
322       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
323 
324       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
          // the non-transpose analysis is now current; the transpose analysis is flagged as not done
325       fs->updatedSpSVAnalysis          = PETSC_TRUE;
326       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
327     }
328   }
329   PetscFunctionReturn(PETSC_SUCCESS);
330 }
331 #else
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds or refreshes the cuSPARSE copy of the unit lower triangular factor

  Returns immediately when n == 0 and does nothing unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or
  PETSC_OFFLOAD_CPU. If no lower factor struct exists yet, the rows are repacked into temporary host CSR arrays with an
  explicit 1.0 appended as the diagonal of each row, assigned into the CsrMatrix containers of a new
  Mat_SeqAIJCUSPARSETriFactorStruct, and the csrsv solve analysis is run; the host value array is kept in
  loTriFactor->AA_h for later updates. Otherwise only the values are repacked and re-assigned.
  A `char *` exception thrown inside the try block is reported as PETSC_ERR_LIB.
*/
332 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
333 {
334   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
335   PetscInt                           n                  = A->rmap->n;
336   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
337   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
338   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
339   const MatScalar                   *aa = a->a, *v;
340   PetscInt                          *AiLo, *AjLo;
341   PetscInt                           i, nz, nzLower, offset, rowOffset;
342 
343   PetscFunctionBegin;
344   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
345   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
346     try {
347       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
348       nzLower = n + ai[n] - ai[1];
349       if (!loTriFactor) {
350         PetscScalar *AALo;
351 
            /* host staging arrays obtained with cudaMallocHost(), i.e. pinned host memory */
352         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
353 
354         /* Allocate Space for the lower triangular matrix */
355         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
356         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
357 
358         /* Fill the lower triangular matrix */
            /* row 0 consists of the unit diagonal only; the loop below starts at row 1 and reads aj/aa from their beginning */
359         AiLo[0]   = (PetscInt)0;
360         AiLo[n]   = nzLower;
361         AjLo[0]   = (PetscInt)0;
362         AALo[0]   = (MatScalar)1.0;
363         v         = aa;
364         vi        = aj;
365         offset    = 1;
366         rowOffset = 1;
367         for (i = 1; i < n; i++) {
368           nz = ai[i + 1] - ai[i];
369           /* additional 1 for the term on the diagonal */
370           AiLo[i] = rowOffset;
371           rowOffset += nz + 1;
372 
373           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
374           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
375 
              /* append the unit diagonal after the nz copied entries of row i */
376           offset += nz;
377           AjLo[offset] = (PetscInt)i;
378           AALo[offset] = (MatScalar)1.0;
379           offset += 1;
380 
381           v += nz;
382           vi += nz;
383         }
384 
385         /* allocate space for the triangular factor information */
386         PetscCall(PetscNew(&loTriFactor));
387         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
388         /* Create the matrix description */
389         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
390         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
391   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
392         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
393   #else
394         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
395   #endif
396         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
397         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
398 
399         /* set the operation */
400         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
401 
402         /* set the matrix */
            /* the host staging arrays are copied into the CsrMatrix containers with assign() */
403         loTriFactor->csrMat              = new CsrMatrix;
404         loTriFactor->csrMat->num_rows    = n;
405         loTriFactor->csrMat->num_cols    = n;
406         loTriFactor->csrMat->num_entries = nzLower;
407 
408         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
409         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
410 
411         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
412         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
413 
414         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
415         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
416 
417         /* Create the solve analysis information */
418         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
419         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
420   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
421         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
422                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
423         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
424   #endif
425 
426         /* perform the solve analysis */
427         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
428                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
429         PetscCallCUDA(WaitForCUDA());
430         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
431 
432         /* assign the pointer */
            /* the host value array is kept in AA_h for later value-only updates; the index staging arrays are released */
433         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
434         loTriFactor->AA_h                                          = AALo;
435         PetscCallCUDA(cudaFreeHost(AiLo));
436         PetscCallCUDA(cudaFreeHost(AjLo));
437         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
438       } else { /* update values only */
439         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
440         /* Fill the lower triangular matrix */
            /* same packing as in the first-time branch, values only; vi is assigned below but not read in this branch */
441         loTriFactor->AA_h[0] = 1.0;
442         v                    = aa;
443         vi                   = aj;
444         offset               = 1;
445         for (i = 1; i < n; i++) {
446           nz = ai[i + 1] - ai[i];
447           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
448           offset += nz;
449           loTriFactor->AA_h[offset] = 1.0;
450           offset += 1;
451           v += nz;
452         }
453         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
454         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
455       }
456     } catch (char *ex) {
457       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
458     }
459   }
460   PetscFunctionReturn(PETSC_SUCCESS);
461 }
462 
463 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
464 {
465   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
466   PetscInt                           n                  = A->rmap->n;
467   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
468   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
469   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
470   const MatScalar                   *aa = a->a, *v;
471   PetscInt                          *AiUp, *AjUp;
472   PetscInt                           i, nz, nzUpper, offset;
473 
474   PetscFunctionBegin;
475   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
476   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
477     try {
478       /* next, figure out the number of nonzeros in the upper triangular matrix. */
479       nzUpper = adiag[0] - adiag[n];
480       if (!upTriFactor) {
481         PetscScalar *AAUp;
482 
483         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
484 
485         /* Allocate Space for the upper triangular matrix */
486         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
487         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
488 
489         /* Fill the upper triangular matrix */
490         AiUp[0] = (PetscInt)0;
491         AiUp[n] = nzUpper;
492         offset  = nzUpper;
493         for (i = n - 1; i >= 0; i--) {
494           v  = aa + adiag[i + 1] + 1;
495           vi = aj + adiag[i + 1] + 1;
496 
497           /* number of elements NOT on the diagonal */
498           nz = adiag[i] - adiag[i + 1] - 1;
499 
500           /* decrement the offset */
501           offset -= (nz + 1);
502 
503           /* first, set the diagonal elements */
504           AjUp[offset] = (PetscInt)i;
505           AAUp[offset] = (MatScalar)1. / v[nz];
506           AiUp[i]      = AiUp[i + 1] - (nz + 1);
507 
508           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
509           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
510         }
511 
512         /* allocate space for the triangular factor information */
513         PetscCall(PetscNew(&upTriFactor));
514         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
515 
516         /* Create the matrix description */
517         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
518         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
519   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
520         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
521   #else
522         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
523   #endif
524         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
525         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
526 
527         /* set the operation */
528         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
529 
530         /* set the matrix */
531         upTriFactor->csrMat              = new CsrMatrix;
532         upTriFactor->csrMat->num_rows    = n;
533         upTriFactor->csrMat->num_cols    = n;
534         upTriFactor->csrMat->num_entries = nzUpper;
535 
536         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
537         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
538 
539         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
540         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
541 
542         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
543         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
544 
545         /* Create the solve analysis information */
546         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
547         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
548   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
549         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
550                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
551         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
552   #endif
553 
554         /* perform the solve analysis */
555         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
556                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
557 
558         PetscCallCUDA(WaitForCUDA());
559         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
560 
561         /* assign the pointer */
562         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
563         upTriFactor->AA_h                                          = AAUp;
564         PetscCallCUDA(cudaFreeHost(AiUp));
565         PetscCallCUDA(cudaFreeHost(AjUp));
566         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
567       } else {
568         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
569         /* Fill the upper triangular matrix */
570         offset = nzUpper;
571         for (i = n - 1; i >= 0; i--) {
572           v = aa + adiag[i + 1] + 1;
573 
574           /* number of elements NOT on the diagonal */
575           nz = adiag[i] - adiag[i + 1] - 1;
576 
577           /* decrement the offset */
578           offset -= (nz + 1);
579 
580           /* first, set the diagonal elements */
581           upTriFactor->AA_h[offset] = 1. / v[nz];
582           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
583         }
584         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
585         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
586       }
587     } catch (char *ex) {
588       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
589     }
590   }
591   PetscFunctionReturn(PETSC_SUCCESS);
592 }
593 #endif
594 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - push the ILU/LU factors held in A to the GPU
  and upload the row/column permutations the first time they are needed.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  With CUDA >= 11.4 the factored matrix is built by MatSeqAIJCUSPARSEBuildFactoredMatrix_LU();
  otherwise the lower and upper triangular factors are built separately and a work vector
  of length n is created if missing.

  On return A->offloadmask is PETSC_OFFLOAD_BOTH. A permutation is uploaded only when the
  corresponding index set is not the identity and no device copy exists yet.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  IS                            rowPerm = aij->row;
  IS                            colPerm = aij->icol;
  const PetscInt                nrows   = A->rmap->n;
  PetscBool                     isIdentity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif

  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH; // the factored matrix now lives on both host and device

  /* row permutation, used with the lower triangular solve */
  PetscCall(ISIdentity(rowPerm, &isIdentity));
  if (!factors->rpermIndices && !isIdentity) {
    const PetscInt *rowIdx;

    PetscCall(ISGetIndices(rowPerm, &rowIdx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(rowIdx, rowIdx + nrows);
    PetscCall(ISRestoreIndices(rowPerm, &rowIdx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }

  /* column permutation, used with the upper triangular solve */
  PetscCall(ISIdentity(colPerm, &isIdentity));
  if (!factors->cpermIndices && !isIdentity) {
    const PetscInt *colIdx;

    PetscCall(ISGetIndices(colPerm, &colIdx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(colIdx, colIdx + nrows);
    PetscCall(ISRestoreIndices(colPerm, &colIdx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
641 
642 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - build (first call) and refresh (every call) the
  device copy of the Cholesky/ICC factor U of A, together with the cuSPARSE SpSV objects used to
  solve with U and U^T. Only compiled when CUDA >= 11.4.

  Input Parameter:
. A - the factored matrix; A->data is a Mat_SeqAIJ and A->spptr a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Nothing is done unless A->offloadmask == PETSC_OFFLOAD_CPU. This routine does not change
  A->offloadmask itself.

  First call (fs->csrRowPtr is NULL): allocates the host staging arrays (kept in fs->csrVal_h and
  fs->diag_h), the device CSR arrays and fs->diag, uploads the row pointers and column indices,
  creates the CSR descriptor of U (upper fill mode, unit diagonal), the dense-vector descriptors
  for the work vectors fs->X/fs->Y, and the SpSV descriptors and buffers for the U and U^T solves.

  Every call: refills the host values (unit diagonal followed by the negated off-diagonal entries
  of each row) and the diagonal array D, uploads both, then either updates the existing SpSV
  analysis (CUDA >= 12.1.1 and fs->updatedSpSVAnalysis already set) or runs cusparseSpSV_analysis().

  The spelling "Cheolesky" in the function name is kept as is, since the name is referenced by
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU().
*/
643 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
644 {
645   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
646   PetscInt                      m  = A->rmap->n;
647   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
648   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
649   const MatScalar              *Aa = a->a;
650   PetscInt                     *Mj, Mnz;
651   PetscScalar                  *Ma, *D;
652 
653   PetscFunctionBegin;
654   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
655     if (!fs->csrRowPtr) {                    // Is this the first time to do the setup? Use csrRowPtr since it is not null even when m=0
656       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
657       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
658       Mnz = Ai[m]; // Unz (with the unit diagonal)
659       PetscCall(PetscMalloc1(Mnz, &Ma));
660       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
661       PetscCall(PetscMalloc1(m, &D));    // the diagonal
662       for (PetscInt i = 0; i < m; i++) {
663         PetscInt ulen = Ai[i + 1] - Ai[i];
664         Mj[Ai[i]]     = i;                                              // diagonal entry
665         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
666       }
667       // Copy M (U) from host to device
668       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
669       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
670       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
671       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
672       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
673       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
674 
675       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
676       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
677       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
678       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
679       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
680       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
681       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
682       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
683 
684       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
685       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
686       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
687 
688       // Allocate work vectors in SpSv
689       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
690       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
691 
692       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
693       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
694 
695       // Query buffer sizes for SpSV and then allocate buffers
696       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
697       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
698       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
699 
700       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
701       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
702       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
703 
704       // Record for reuse
705       fs->csrVal_h = Ma;
706       fs->diag_h   = D;
707       PetscCall(PetscFree(Mj));
708     }
709     // Copy the value
710     Ma  = fs->csrVal_h;
711     D   = fs->diag_h;
712     Mnz = Ai[m];
713     for (PetscInt i = 0; i < m; i++) {
714       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
715       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // off-diagonal entries of row i are stored negated on the device
716       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
717     }
718     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
719     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
720 
    // When an analysis already exists (and CUDA >= 12.1.1), only push the new values into the two SpSV descriptors
721   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
722     if (fs->updatedSpSVAnalysis) {
723       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
724       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
725     } else
726   #endif
727     {
728       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
729       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
730       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
731       fs->updatedSpSVAnalysis = PETSC_TRUE;
732     }
733   }
734   PetscFunctionReturn(PETSC_SUCCESS);
735 }
736 
737 // Solve Ut D U x = b
/*
  MatSolve_SeqAIJCUSPARSE_Cholesky - solve Ut D U x = b on the GPU with cuSPARSE SpSV.

  Input Parameters:
+ A - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors built beforehand
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The steps are: optional row permutation of b into fs->X, transpose solve into fs->Y,
  element-wise scaling of fs->Y by fs->diag, non-transpose solve, and an optional
  column permutation of the result into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *seq     = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                nrows   = A->rmap->n;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;
  void                         *firstRhs, *secondSol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solDev = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsDev = thrust::device_pointer_cast(rhs);

  // Step 1: pick the right-hand side of the first solve; gather b through the row permutation when there is one
  if (factors->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->begin()), thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    firstRhs = factors->X;
  } else {
    firstRhs = (void *)rhs;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, firstRhs));

  // Step 2: Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut));

  // Step 3: diag(D) Z = Y, done in place as Y = Y .* D because D already stores the inverted diagonal
  // (see MatCholeskyFactorNumeric_SeqAIJ()). cublas offers no element-wise product, hence thrust.
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(factors->Y), thrust::device_pointer_cast(factors->Y + nrows), thrust::device_pointer_cast(factors->diag), thrust::device_pointer_cast(factors->Y), thrust::multiplies<PetscScalar>()));

  // Step 4: U X = Y; the result goes to the intermediate buffer X if a column permutation follows, else straight to x
  secondSol = factors->cpermIndices ? (void *)factors->X : (void *)sol;
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, secondSol));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  // Step 5: gather X through the column permutation into x when needed
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), solDev));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * seq->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
792 #else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - build or refresh the two triangular factors used by the
  legacy csrsv-based ICC/Cholesky solves. Only compiled when CUDA < 11.4.

  Input Parameter:
. A - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no local rows. Otherwise work is done only when A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Both factors share the same row offsets and column indices (AiUp/AjUp) and differ only in their
  values and descriptors:
    - "up" factor: values AAUp, unit diagonal, solved with CUSPARSE_OPERATION_NON_TRANSPOSE
    - "lo" factor: values AALo, non-unit diagonal, solved with CUSPARSE_OPERATION_TRANSPOSE
  For each row the stored diagonal is 1/v[nz]; off-diagonals are -v[k] in AAUp and -v[k]/v[nz] in AALo.

  If neither factor exists yet, the structure is created and the csrsv analysis is run for both;
  otherwise only the values are recomputed and re-uploaded. The pinned host value buffers are
  released before returning from the try block.
*/
793 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
794 {
795   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
796   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
797   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
798   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
799   PetscInt                          *AiUp, *AjUp;
800   PetscScalar                       *AAUp;
801   PetscScalar                       *AALo;
802   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
803   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
804   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
805   const MatScalar                   *aa = b->a, *v;
806 
807   PetscFunctionBegin;
808   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
809   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
810     try {
      /* pinned host staging buffers for the values of the two factors */
811       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
812       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
813       if (!upTriFactor && !loTriFactor) {
814         /* Allocate Space for the upper triangular matrix */
815         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
816         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
817 
818         /* Fill the upper triangular matrix */
819         AiUp[0] = (PetscInt)0;
820         AiUp[n] = nzUpper;
821         offset  = 0;
822         for (i = 0; i < n; i++) {
823           /* set the pointers */
824           v  = aa + ai[i];
825           vj = aj + ai[i];
826           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
827 
828           /* first, set the diagonal elements */
829           AjUp[offset] = (PetscInt)i;
830           AAUp[offset] = (MatScalar)1.0 / v[nz];
831           AiUp[i]      = offset;
832           AALo[offset] = (MatScalar)1.0 / v[nz];
833 
834           offset += 1;
835           if (nz > 0) {
836             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
837             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* negate the off-diagonals; the lo factor additionally divides them by v[nz] */
838             for (j = offset; j < offset + nz; j++) {
839               AAUp[j] = -AAUp[j];
840               AALo[j] = AAUp[j] / v[nz];
841             }
842             offset += nz;
843           }
844         }
845 
846         /* allocate space for the triangular factor information */
847         PetscCall(PetscNew(&upTriFactor));
848         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
849 
850         /* Create the matrix description */
851         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
852         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
853   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
854         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
855   #else
856         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
857   #endif
858         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
859         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
860 
861         /* set the matrix */
862         upTriFactor->csrMat              = new CsrMatrix;
863         upTriFactor->csrMat->num_rows    = A->rmap->n;
864         upTriFactor->csrMat->num_cols    = A->cmap->n;
865         upTriFactor->csrMat->num_entries = a->nz;
866 
867         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
868         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
869 
870         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
871         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
872 
873         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
874         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
875 
876         /* set the operation */
877         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
878 
879         /* Create the solve analysis information */
880         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
881         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
882   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
883         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
884                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
885         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
886   #endif
887 
888         /* perform the solve analysis */
889         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
890                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
891 
892         PetscCallCUDA(WaitForCUDA());
893         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
894 
895         /* assign the pointer */
896         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
897 
898         /* allocate space for the triangular factor information */
899         PetscCall(PetscNew(&loTriFactor));
900         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
901 
902         /* Create the matrix description */
903         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
904         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
905   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
906         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
907   #else
908         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
909   #endif
        /* the lo factor is stored with the same (upper) structure and is applied through a transpose solve */
910         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
911         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
912 
913         /* set the operation */
914         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
915 
916         /* set the matrix */
917         loTriFactor->csrMat              = new CsrMatrix;
918         loTriFactor->csrMat->num_rows    = A->rmap->n;
919         loTriFactor->csrMat->num_cols    = A->cmap->n;
920         loTriFactor->csrMat->num_entries = a->nz;
921 
922         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
923         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
924 
925         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
926         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
927 
928         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
929         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
930 
931         /* Create the solve analysis information */
932         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
933         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
934   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
935         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
936                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
937         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
938   #endif
939 
940         /* perform the solve analysis */
941         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
942                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
943 
944         PetscCallCUDA(WaitForCUDA());
945         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
946 
947         /* assign the pointer */
948         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
949 
950         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
951         PetscCallCUDA(cudaFreeHost(AiUp));
952         PetscCallCUDA(cudaFreeHost(AjUp));
953       } else {
        /* the structure already exists on the GPU: recompute the values only */
954         /* Fill the upper triangular matrix */
955         offset = 0;
956         for (i = 0; i < n; i++) {
957           /* set the pointers */
958           v  = aa + ai[i];
959           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
960 
961           /* first, set the diagonal elements */
962           AAUp[offset] = 1.0 / v[nz];
963           AALo[offset] = 1.0 / v[nz];
964 
965           offset += 1;
966           if (nz > 0) {
967             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
968             for (j = offset; j < offset + nz; j++) {
969               AAUp[j] = -AAUp[j];
970               AALo[j] = AAUp[j] / v[nz];
971             }
972             offset += nz;
973           }
974         }
        /* this branch is also reached when only one of the two factors exists; reject that state */
975         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
976         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
977         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
978         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
979         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
980       }
981       PetscCallCUDA(cudaFreeHost(AAUp));
982       PetscCallCUDA(cudaFreeHost(AALo));
983     } catch (char *ex) {
984       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
985     }
986   }
987   PetscFunctionReturn(PETSC_SUCCESS);
988 }
989 #endif
990 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - push the ICC/Cholesky factor held in A to the GPU
  and upload the permutation (and its inverse) the first time they are needed.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  With CUDA >= 11.4 the factor is built by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky();
  otherwise MatSeqAIJCUSPARSEBuildICCTriMatrices() is used and a work vector of length n is
  created if missing.

  On return A->offloadmask is PETSC_OFFLOAD_BOTH.

  This routine runs on every numeric factorization. The device permutation arrays are therefore
  created only when they do not exist yet, as MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() does;
  allocating them unconditionally would leak the previous arrays on each refactorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* the factor stores one triangle only: count the off-diagonal entries twice, the diagonal once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    /* row permutation: the ordering itself */
    if (!cusparseTriFactors->rpermIndices) {
      const PetscInt *rip;

      PetscCall(ISGetIndices(ip, &rip));
      cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
      cusparseTriFactors->rpermIndices->assign(rip, rip + n);
      PetscCall(ISRestoreIndices(ip, &rip));
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
    }

    /* column permutation: the inverse of the ordering */
    if (!cusparseTriFactors->cpermIndices) {
      IS              iip;
      const PetscInt *irip;

      PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
      PetscCall(ISGetIndices(iip, &irip));
      cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
      cusparseTriFactors->cpermIndices->assign(irip, irip + n);
      PetscCall(ISRestoreIndices(iip, &irip));
      PetscCall(ISDestroy(&iip));
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1032 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  Input Parameters:
+ A    - the matrix to factor; its values are first brought back from the GPU
- info - factorization options, forwarded to MatCholeskyFactorNumeric_SeqAIJ()

  Output Parameter:
. B - the factored matrix

  Notes:
  The factorization is computed on the host by MatCholeskyFactorNumeric_SeqAIJ(). Afterwards the
  solve callbacks of B are set and the factor is copied to the GPU by
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(). The matsolve and matsolvetranspose callbacks are cleared.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* factor on the host */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* the same routine is installed for both the solve and the transpose solve */
  B->ops->solvetranspose = B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  {
    /* the natural-ordering solves are used when the ordering is the identity */
    PetscBool naturalOrdering;

    PetscCall(ISIdentity(((Mat_SeqAIJ *)B->data)->row, &naturalOrdering));
    if (!naturalOrdering) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the freshly computed factor to the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1065 
1066 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1067 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1068 {
1069   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1070   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1071   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1072   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1073   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1074   cusparseIndexBase_t                indexBase;
1075   cusparseMatrixType_t               matrixType;
1076   cusparseFillMode_t                 fillMode;
1077   cusparseDiagType_t                 diagType;
1078 
1079   PetscFunctionBegin;
1080   /* allocate space for the transpose of the lower triangular factor */
1081   PetscCall(PetscNew(&loTriFactorT));
1082   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1083 
1084   /* set the matrix descriptors of the lower triangular factor */
1085   matrixType = cusparseGetMatType(loTriFactor->descr);
1086   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1087   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1088   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1089 
1090   /* Create the matrix description */
1091   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1092   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1093   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1094   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1095   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1096 
1097   /* set the operation */
1098   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1099 
1100   /* allocate GPU space for the CSC of the lower triangular factor*/
1101   loTriFactorT->csrMat                 = new CsrMatrix;
1102   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1103   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1104   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1105   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1106   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1107   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1108 
1109   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1110   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1111   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1112                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1113                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1114   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1115   #endif
1116 
1117   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1118   {
1119     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1120     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1121                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1122   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1123                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1124   #else
1125                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1126   #endif
1127     PetscCallCUSPARSE(stat);
1128   }
1129 
1130   PetscCallCUDA(WaitForCUDA());
1131   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1132 
1133   /* Create the solve analysis information */
1134   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1135   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1136   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1137   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1138                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1139   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1140   #endif
1141 
1142   /* perform the solve analysis */
1143   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1144                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1145 
1146   PetscCallCUDA(WaitForCUDA());
1147   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1148 
1149   /* assign the pointer */
1150   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1151 
1152   /*********************************************/
1153   /* Now the Transpose of the Upper Tri Factor */
1154   /*********************************************/
1155 
1156   /* allocate space for the transpose of the upper triangular factor */
1157   PetscCall(PetscNew(&upTriFactorT));
1158   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1159 
1160   /* set the matrix descriptors of the upper triangular factor */
1161   matrixType = cusparseGetMatType(upTriFactor->descr);
1162   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1163   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1164   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1165 
1166   /* Create the matrix description */
1167   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1168   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1169   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1170   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1171   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1172 
1173   /* set the operation */
1174   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1175 
1176   /* allocate GPU space for the CSC of the upper triangular factor*/
1177   upTriFactorT->csrMat                 = new CsrMatrix;
1178   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1179   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1180   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1181   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1182   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1183   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1184 
1185   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1186   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1187   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1188                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1189                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1190   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1191   #endif
1192 
1193   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1194   {
1195     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1196     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1197                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1198   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1199                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1200   #else
1201                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1202   #endif
1203     PetscCallCUSPARSE(stat);
1204   }
1205 
1206   PetscCallCUDA(WaitForCUDA());
1207   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1208 
1209   /* Create the solve analysis information */
1210   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1211   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1212   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1213   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1214                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1215   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1216   #endif
1217 
1218   /* perform the solve analysis */
1219   /* christ, would it have killed you to put this stuff in a function????????? */
1220   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1221                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1222 
1223   PetscCallCUDA(WaitForCUDA());
1224   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1225 
1226   /* assign the pointer */
1227   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1228   PetscFunctionReturn(PETSC_SUCCESS);
1229 }
1230 #endif
1231 
1232 struct PetscScalarToPetscInt {
1233   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1234 };
1235 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Build, or refresh the values of, the explicit transpose stored in
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; A->spptr is read as Mat_SeqAIJCUSPARSE and A->data as Mat_SeqAIJ

  Notes:
  Returns immediately (after the sanity checks) when A->transupdated is already set.

  When no transpose struct exists, one is created: a matrix descriptor with the index base of the original,
  three device scalars (alpha_one, beta_zero, beta_one) and, for MAT_CUSPARSE_CSR, CSR storage with
  A->cmap->n rows, A->rmap->n columns and a->nz entries.

  For MAT_CUSPARSE_CSR the transposed values are then filled by gathering the values of A through the
  permutation cusparsestruct->csr2csc_i. That permutation is computed only when it is missing, by pushing
  the sequence 0,1,2,... through cusparse_csr2csc() as if it were the matrix values.

  On success A->transupdated is set to PETSC_TRUE and cusparsestruct->matTranspose points to the transpose.
*/
1236 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1237 {
1238   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1239   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1240   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1241   cusparseStatus_t              stat;
1242   cusparseIndexBase_t           indexBase;
1243 
1244   PetscFunctionBegin;
       /* presumably brings the device copy of A up to date before it is read below -- see MatSeqAIJCUSPARSECopyToGPU() */
1245   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1246   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1247   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1248   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1249   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
       /* the transpose is flagged as current: nothing to rebuild */
1250   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1251   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1252   PetscCall(PetscLogGpuTimeBegin());
       /* Only the CSR path below can update an existing transpose in place. For the other formats the old one is
          invalidated first; presumably that clears cusparsestruct->matTranspose so the block below recreates it
          (confirm in MatSeqAIJCUSPARSEInvalidateTranspose()) */
1253   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1254   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1255     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1256     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1257     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1259     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1260 
1261     /* set alpha and beta */
1262     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1263     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1264     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1265     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1266     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1267     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1268 
1269     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
           /* storage for the transpose: rows and columns of A swapped, same number of entries */
1270       CsrMatrix *matrixT      = new CsrMatrix;
1271       matstructT->mat         = matrixT;
1272       matrixT->num_rows       = A->cmap->n;
1273       matrixT->num_cols       = A->rmap->n;
1274       matrixT->num_entries    = a->nz;
1275       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1276       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1277       matrixT->values         = new THRUSTARRAY(a->nz);
1278 
           /* copy the row offsets a->i of A into rowoffsets_gpu; they are the row pointer handed to csr2csc further down */
1279       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1280       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1281 
1282 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1283   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1284       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1285                                indexBase, cusparse_scalartype);
1286       PetscCallCUSPARSE(stat);
1287   #else
1288       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1289            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1290 
1291            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1292            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1293            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1294         */
1295       if (matrixT->num_entries) {
1296         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1297         PetscCallCUSPARSE(stat);
1298 
1299       } else {
1300         matstructT->matDescr = NULL;
1301         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1302       }
1303   #endif
1304 #endif
1305     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1306 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1307       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1308 #else
           /* pre CUDA-11 path: HYB -> CSR (temp) -> CSC (tempT) -> HYB; both temporaries are deleted at the end of this branch */
1309       CsrMatrix *temp  = new CsrMatrix;
1310       CsrMatrix *tempT = new CsrMatrix;
1311       /* First convert HYB to CSR */
1312       temp->num_rows       = A->rmap->n;
1313       temp->num_cols       = A->cmap->n;
1314       temp->num_entries    = a->nz;
1315       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1316       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1317       temp->values         = new THRUSTARRAY(a->nz);
1318 
1319       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1320       PetscCallCUSPARSE(stat);
1321 
1322       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
           /* NOTE(review): tempT holds the transpose but is sized with A->rmap->n rows / A->cmap->n columns, and
              csr2hyb below is also called with (A->rmap->n, A->cmap->n), whereas the CSR branch above swaps them.
              This looks right only for square matrices -- TODO confirm */
1323       tempT->num_rows       = A->rmap->n;
1324       tempT->num_cols       = A->cmap->n;
1325       tempT->num_entries    = a->nz;
1326       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1327       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1328       tempT->values         = new THRUSTARRAY(a->nz);
1329 
1330       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1331                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1332       PetscCallCUSPARSE(stat);
1333 
1334       /* Last, convert CSC to HYB */
1335       cusparseHybMat_t hybMat;
1336       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1337       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1338       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1339       PetscCallCUSPARSE(stat);
1340 
1341       /* assign the pointer */
1342       matstructT->mat = hybMat;
1343       A->transupdated = PETSC_TRUE;
1344       /* delete temporaries */
1345       if (tempT) {
1346         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1347         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1348         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1349         delete (CsrMatrix *)tempT;
1350       }
1351       if (temp) {
1352         if (temp->values) delete (THRUSTARRAY *)temp->values;
1353         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1354         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1355         delete (CsrMatrix *)temp;
1356       }
1357 #endif
1358     }
1359   }
1360   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1361     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1362     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1363     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1364     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1365     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1366     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1367     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1368     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1369     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1370     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1371     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1372       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1373       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1374       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1375     }
         /* The CSR -> CSC entry permutation is computed only when it is not cached in csr2csc_i */
1376     if (!cusparsestruct->csr2csc_i) {
           /* csr2csc_a = 0,1,2,...: it is passed to csr2csc in place of the values of A, so that each transposed
              "value" written to matrixT->values is the position of the corresponding entry in A */
1377       THRUSTARRAY csr2csc_a(matrix->num_entries);
1378       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1379 
1380       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1381 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1382       void  *csr2cscBuffer;
1383       size_t csr2cscBufferSize;
1384       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1385                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1386       PetscCallCUSPARSE(stat);
           /* NOTE(review): this scratch buffer is released by the cudaFree() at the end of this block; if one of the
              PetscCall*() checks in between returns early on error the cudaFree() would be skipped -- TODO confirm */
1387       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1388 #endif
1389 
1390       if (matrix->num_entries) {
1391         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1392            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1393            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1394 
1395            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1396            should be filled with indexBase. So I just take a shortcut here.
1397         */
1398         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1399 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1400                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1401         PetscCallCUSPARSE(stat);
1402 #else
1403                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1404         PetscCallCUSPARSE(stat);
1405 #endif
1406       } else {
             /* no nonzeros: csr2csc is skipped and the transposed row offsets are filled with indexBase */
1407         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1408       }
1409 
           /* turn the scalar-encoded positions left in matrixT->values into the integer permutation csr2csc_i */
1410       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1411       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1412 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1413       PetscCallCUDA(cudaFree(csr2cscBuffer));
1414 #endif
1415     }
         /* gather the current values of A through csr2csc_i into the transpose: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
1416     PetscCallThrust(
1417       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1418   }
1419   PetscCall(PetscLogGpuTimeEnd());
1420   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1421   /* the compressed row indices is not used for matTranspose */
1422   matstructT->cprowIndices = NULL;
1423   /* assign the pointer */
1424   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1425   A->transupdated                                = PETSC_TRUE;
1426   PetscFunctionReturn(PETSC_SUCCESS);
1427 }
1428 
1429 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1430 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1431 {
1432   const PetscScalar                    *barray;
1433   PetscScalar                          *xarray;
1434   thrust::device_ptr<const PetscScalar> bGPU;
1435   thrust::device_ptr<PetscScalar>       xGPU;
1436   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1437   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1438   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1439   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1440   PetscInt                              m   = A->rmap->n;
1441 
1442   PetscFunctionBegin;
1443   PetscCall(PetscLogGpuTimeBegin());
1444   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1445   PetscCall(VecCUDAGetArrayRead(b, &barray));
1446   xGPU = thrust::device_pointer_cast(xarray);
1447   bGPU = thrust::device_pointer_cast(barray);
1448 
1449   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1450   if (fs->rpermIndices) {
1451     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1452     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1453   } else {
1454     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1455   }
1456 
1457   // Solve L Y = X
1458   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1459   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1460   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1461 
1462   // Solve U X = Y
1463   if (fs->cpermIndices) {
1464     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1465   } else {
1466     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1467   }
1468   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1469 
1470   // Reorder X with the column permutation if needed, and put the result back to x
1471   if (fs->cpermIndices) {
1472     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1473                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1474   }
1475   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1476   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1477   PetscCall(PetscLogGpuTimeEnd());
1478   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1479   PetscFunctionReturn(PETSC_SUCCESS);
1480 }
1481 
1482 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1483 {
1484   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1485   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1486   const PetscScalar                    *barray;
1487   PetscScalar                          *xarray;
1488   thrust::device_ptr<const PetscScalar> bGPU;
1489   thrust::device_ptr<PetscScalar>       xGPU;
1490   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1491   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1492   PetscInt                              m   = A->rmap->n;
1493 
1494   PetscFunctionBegin;
1495   PetscCall(PetscLogGpuTimeBegin());
1496   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1497     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1498     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1499                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1500 
1501     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1502     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1503     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1504     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1505     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1506   }
1507 
1508   if (!fs->updatedTransposeSpSVAnalysis) {
1509     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1510 
1511     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1512     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1513   }
1514 
1515   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1516   PetscCall(VecCUDAGetArrayRead(b, &barray));
1517   xGPU = thrust::device_pointer_cast(xarray);
1518   bGPU = thrust::device_pointer_cast(barray);
1519 
1520   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1521   if (fs->rpermIndices) {
1522     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1523     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1524   } else {
1525     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1526   }
1527 
1528   // Solve Ut Y = X
1529   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1530   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1531 
1532   // Solve Lt X = Y
1533   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1534     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1535   } else {
1536     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1537   }
1538   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1539 
1540   // Reorder X with the column permutation if needed, and put the result back to x
1541   if (fs->cpermIndices) {
1542     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1543                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1544   }
1545 
1546   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1547   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1548   PetscCall(PetscLogGpuTimeEnd());
1549   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1550   PetscFunctionReturn(PETSC_SUCCESS);
1551 }
1552 #else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve with the factored matrix A: right-hand side bb, result in xx.

  Uses the transposed triangular factors stored in A->spptr (built on first use) together with the
  row and column permutation index arrays. Steps, all on the device:
    1. gather bb through rpermIndices into xx
    2. solve with the transposed upper factor (xx -> work vector)
    3. solve with the transposed lower factor (work vector -> xx)
    4. gather xx through cpermIndices into the work vector and copy it back into xx
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscInt                     len     = xx->map->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *bdev;
  PetscScalar                       *xdev;

  PetscFunctionBegin;
  /* Neither transposed factor exists yet: analyze and build them now, then re-read the pointers */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device views of the two vectors, raw and wrapped for thrust */
  PetscCall(VecCUDAGetArrayWrite(xx, &xdev));
  PetscCall(VecCUDAGetArrayRead(bb, &bdev));
  thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(xdev);
  thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(bdev);

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: xx <- bb gathered through the row permutation */
  {
    auto rowPermBegin = thrust::make_permutation_iterator(bGPU, factors->rpermIndices->begin());
    auto rowPermEnd   = thrust::make_permutation_iterator(bGPU + len, factors->rpermIndices->end());

    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), rowPermBegin, rowPermEnd, xGPU);
  }

  /* Step 2: solve with the transposed upper factor, reading xx and writing the work vector */
  {
    auto *csrU = upperT->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, csrU->num_rows, csrU->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr, csrU->values->data().get(), csrU->row_offsets->data().get(), csrU->column_indices->data().get(),
                                           upperT->solveInfo, xdev, work->data().get(), upperT->solvePolicy, upperT->solveBuffer));
  }

  /* Step 3: solve with the transposed lower factor, reading the work vector and writing xx */
  {
    auto *csrL = lowerT->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, csrL->num_rows, csrL->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr, csrL->values->data().get(), csrL->row_offsets->data().get(), csrL->column_indices->data().get(),
                                           lowerT->solveInfo, work->data().get(), xdev, lowerT->solvePolicy, lowerT->solveBuffer));
  }

  /* Step 4: the column permutation cannot be applied in place, so gather into the work vector first ... */
  {
    auto colPermBegin = thrust::make_permutation_iterator(xGPU, factors->cpermIndices->begin());
    auto colPermEnd   = thrust::make_permutation_iterator(xGPU + len, factors->cpermIndices->end());

    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), colPermBegin, colPermEnd, work->begin());
  }
  /* ... and then move the permuted result back into xx */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), work->begin(), work->end(), xGPU);

  /* Hand the arrays back and close the timing/flop logging */
  PetscCall(VecCUDARestoreArrayRead(bb, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1605 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve with the factored matrix A without any
  permutation step: right-hand side bb, result in xx.

  The transposed upper factor is solved first (bb -> work vector), then the transposed lower factor
  (work vector -> xx). The transposed factors are built on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *bdev;
  PetscScalar                       *xdev;

  PetscFunctionBegin;
  /* Neither transposed factor exists yet: analyze and build them now, then re-read the pointers */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the output and input vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &xdev));
  PetscCall(VecCUDAGetArrayRead(bb, &bdev));

  PetscCall(PetscLogGpuTimeBegin());
  /* Transposed upper factor: bb -> work vector */
  {
    auto *csrU = upperT->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, csrU->num_rows, csrU->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr, csrU->values->data().get(), csrU->row_offsets->data().get(), csrU->column_indices->data().get(),
                                           upperT->solveInfo, bdev, work->data().get(), upperT->solvePolicy, upperT->solveBuffer));
  }

  /* Transposed lower factor: work vector -> xx */
  {
    auto *csrL = lowerT->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, csrL->num_rows, csrL->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr, csrL->values->data().get(), csrL->row_offsets->data().get(), csrL->column_indices->data().get(),
                                           lowerT->solveInfo, work->data().get(), xdev, lowerT->solvePolicy, lowerT->solveBuffer));
  }

  /* Hand the arrays back and close the timing/flop logging */
  PetscCall(VecCUDARestoreArrayRead(bb, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1643 
/*
  MatSolve_SeqAIJCUSPARSE - solve with the factored matrix A: right-hand side bb, result in xx, using the
  lower/upper triangular factors and the row and column permutation index arrays stored in A->spptr.

  Steps, all on the device:
    1. gather bb through rpermIndices into the work vector
    2. solve with the lower factor (work vector -> xx)
    3. solve with the upper factor (xx -> work vector)
    4. gather the work vector through cpermIndices into xx
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *bdev;
  PetscScalar                       *xdev;

  PetscFunctionBegin;
  /* Device views of the two vectors, raw and wrapped for thrust */
  PetscCall(VecCUDAGetArrayWrite(xx, &xdev));
  PetscCall(VecCUDAGetArrayRead(bb, &bdev));
  thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(xdev);
  thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(bdev);

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: work vector <- bb gathered through the row permutation */
  {
    auto rowPermBegin = thrust::make_permutation_iterator(bGPU, factors->rpermIndices->begin());
    auto rowPermEnd   = thrust::make_permutation_iterator(bGPU, factors->rpermIndices->end());

    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), rowPermBegin, rowPermEnd, work->begin());
  }

  /* Step 2: lower factor, work vector -> xx */
  {
    auto *csrL = lower->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, csrL->num_rows, csrL->num_entries, &PETSC_CUSPARSE_ONE, lower->descr, csrL->values->data().get(), csrL->row_offsets->data().get(), csrL->column_indices->data().get(),
                                           lower->solveInfo, work->data().get(), xdev, lower->solvePolicy, lower->solveBuffer));
  }

  /* Step 3: upper factor, xx -> work vector */
  {
    auto *csrU = upper->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, csrU->num_rows, csrU->num_entries, &PETSC_CUSPARSE_ONE, upper->descr, csrU->values->data().get(), csrU->row_offsets->data().get(), csrU->column_indices->data().get(),
                                           upper->solveInfo, xdev, work->data().get(), upper->solvePolicy, upper->solveBuffer));
  }

  /* Step 4: xx <- work vector gathered through the column permutation */
  {
    auto colPermBegin = thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin());
    auto colPermEnd   = thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end());

    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), colPermBegin, colPermEnd, xGPU);
  }

  /* Hand the arrays back and close the timing/flop logging */
  PetscCall(VecCUDARestoreArrayRead(bb, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1683 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve with the factored matrix A without any permutation step:
  right-hand side bb, result in xx.

  The lower factor is solved first (bb -> work vector), then the upper factor (work vector -> xx).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *bdev;
  PetscScalar                       *xdev;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &xdev));
  PetscCall(VecCUDAGetArrayRead(bb, &bdev));

  PetscCall(PetscLogGpuTimeBegin());
  /* Lower factor: bb -> work vector */
  {
    auto *csrL = lower->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, csrL->num_rows, csrL->num_entries, &PETSC_CUSPARSE_ONE, lower->descr, csrL->values->data().get(), csrL->row_offsets->data().get(), csrL->column_indices->data().get(),
                                           lower->solveInfo, bdev, work->data().get(), lower->solvePolicy, lower->solveBuffer));
  }

  /* Upper factor: work vector -> xx */
  {
    auto *csrU = upper->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, csrU->num_rows, csrU->num_entries, &PETSC_CUSPARSE_ONE, upper->descr, csrU->values->data().get(), csrU->row_offsets->data().get(), csrU->column_indices->data().get(),
                                           upper->solveInfo, work->data().get(), xdev, upper->solvePolicy, upper->solveBuffer));
  }

  /* Hand the arrays back and close the timing/flop logging */
  PetscCall(VecCUDARestoreArrayRead(bb, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1713 #endif
1714 
1715 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of the device ILU(0) factorization.

  Copies the current values of A into fact's device value array (fs->csrVal), factorizes them in place with
  cusparseXcsrilu02(), and then makes the SpSV descriptors of L and U consistent with the new values, either
  through cusparseSpSV_updateMatrix() (CUDA >= 12.1.1, once an analysis has been done) or through a fresh
  cusparseSpSV_analysis(). Finally installs the LU solve routines on fact.

  Input Parameters:
+ fact - the factor matrix; its spptr holds the Mat_SeqAIJCUSPARSETriFactors set up by the symbolic phase
. A    - the matrix to factor; checked to be MATSEQAIJCUSPARSE in debug builds
- (unnamed MatFactorInfo) - unused
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* An analysis already exists and only the values changed: refresh the L and U SpSV descriptors in place.
       The NULL guard skips the update when there is no value array. */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }
  /* L, U values have changed on BOTH paths above, so reset the flag to indicate we need to redo
     cusparseSpSV_analysis() for transpose solve. Only the non-transpose descriptors are refreshed here;
     leaving the flag set after the updateMatrix path would let a transpose solve reuse an analysis done on
     the previous values.
     NOTE(review): assumes the transpose solve re-runs its analysis whenever this flag is PETSC_FALSE -- confirm. */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1778 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - symbolic phase of the device ILU(0) factorization.

  Resets any previous factor data, copies A's metadata and its 32-bit CSR row pointers/column indices into
  fact, allocates the device value array that will be factorized in place, creates the cuSPARSE descriptors
  for M (the matrix being factorized), L and U, sizes and allocates the work buffers, runs the csrilu02
  analysis, estimates the flops of the numeric phase, and installs the numeric factorization routine.

  Input Parameters:
+ fact - the factor matrix to set up
. A    - the matrix to factor; in debug builds it is checked to be a square MATSEQAIJCUSPARSE with no missing diagonal entry
. (two unnamed IS) - unused
- info - factorization options; only info->fill is read, to record the requested fill ratio
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  missingRow;
    PetscBool isCusparse, hasMissingDiag;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &hasMissingDiag, &missingRow));
    PetscCheck(!hasMissingDiag, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, missingRow);
  }

  /* Drop whatever a previous factorization left behind */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Mirror A's metadata in fact. The host i,j,a arrays are allocated as well; they are not used
     and exist only to make debugging easier.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ---------------------------------------------------------------------- */
  /* Device CSR storage of fact: copy A's row pointers and column indices,  */
  /* allocate the value array on which the in-place factorization operates  */
  /* ---------------------------------------------------------------------- */
  const int *devRowPtr, *devColIdx;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &devRowPtr, &devColIdx)); /* Uncompressed row pointers are requested. The returned arrays are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, devRowPtr, sizeof(*devRowPtr) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, devColIdx, sizeof(*devColIdx) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ---------------------------------------------------------------------- */
  /* Descriptors: legacy descriptor for M, generic CSR descriptors for L, U */
  /* ---------------------------------------------------------------------- */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are two views of the same CSR arrays; they differ only in fill mode and diagonal type */
  cusparseFillMode_t fillLower = CUSPARSE_FILL_MODE_LOWER, fillUpper = CUSPARSE_FILL_MODE_UPPER;
  cusparseDiagType_t diagUnit = CUSPARSE_DIAG_TYPE_UNIT, diagNonUnit = CUSPARSE_DIAG_TYPE_NON_UNIT;

  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillLower, sizeof(fillLower)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagUnit, sizeof(diagUnit)));

  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillUpper, sizeof(fillUpper)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagNonUnit, sizeof(diagNonUnit)));

  /* ---------------------------------------------------------------------- */
  /* Buffer sizes for csrilu0 and SpSV, plus the dense work vectors X and Y */
  /* ---------------------------------------------------------------------- */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) {
    /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
  }

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    /* L's buffer is the larger one: it doubles as the factorization buffer */
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    /* U's buffer is at least as large: it doubles as the factorization buffer */
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ---------------------------------------------------------------------- */
  /* Analysis of ilu0 on M. The lower (upper) triangular part of M has the  */
  /* same sparsity pattern as L (U)                                         */
  /* ---------------------------------------------------------------------- */
  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) {
    /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int              structural_zero;
    cusparseStatus_t status;

    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization from A's host row pointers and diagonal markers */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    const PetscInt *rowPtr  = Aseq->i;
    const PetscInt *diagPos = Aseq->diag;

    for (PetscInt row = 0; row < m; row++) {
      /* Only rows with nonzeros left of the diagonal contribute */
      if (!(rowPtr[row] < diagPos[row] && diagPos[row] < rowPtr[row + 1])) continue;

      const PetscInt rowLen = rowPtr[row + 1] - rowPtr[row];
      /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
         and include the eliminated one will be updated, which incurs a multiplication and an addition.
         The number of eliminated entries is taken as (rowLen - 1) / 2 rather than the actual count.
      */
      const PetscInt numElim = (rowLen - 1) / 2;

      flops += numElim * (2.0 * rowLen - numElim + 1);
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1935 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - solve with the device ICC(0) factor: right-hand side b, result in x.

  Performs two SpSV solves with the same lower-triangular descriptor: first L y = b with the
  non-transpose operation, then Lt x = y with the transpose operation. The intermediate y lives in fs->Y.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xdev;
  const PetscScalar            *bdev;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L*y = b: descriptor X points at b, descriptor Y at the internal buffer */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bdev));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  /* L Y = X */
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve Lt*x = y: descriptor X is re-pointed at x, the roles of X and Y are swapped */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xdev));
  /* Lt X = Y */
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factAij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1966 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - numeric phase of the device ICC(0) factorization.

  Copies the current values of A into fact's device value array (fs->csrVal), factorizes them in place with
  cusparseXcsric02(), and then makes the SpSV descriptors for L and Lt consistent with the new values, either
  through cusparseSpSV_updateMatrix() (CUDA >= 12.1.1, once an analysis has been done) or through a fresh
  cusparseSpSV_analysis(). Finally installs MatSolve_SeqAIJCUSPARSE_ICC0 as both solve and transpose solve.

  The device work is bracketed by PetscLogGpuTimeBegin()/PetscLogGpuTimeEnd(), in the same position as in
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(), so the GPU flops logged at the end have a matching GPU time.

  Input Parameters:
+ fact - the factor matrix; its spptr holds the Mat_SeqAIJCUSPARSETriFactors set up by the symbolic phase
. A    - the matrix to factor; checked to be MATSEQAIJCUSPARSE in debug builds
- (unnamed MatFactorInfo) - unused
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  /* The call is skipped for empty matrices (m=0) */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* An analysis already exists and only the values changed: refresh both the L and the Lt SpSV descriptors.
       The NULL guard skips the update when there is no value array. */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* the same routine serves both solves */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2028 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic phase of an ICC(0) factorization set up for cuSPARSE csric02.

  Input Parameters:
+ fact      - the factor matrix; its spptr is used as a Mat_SeqAIJCUSPARSETriFactors and its data as a Mat_SeqAIJ
. A         - the matrix to factor (checked to be a square MATSEQAIJCUSPARSE with a full diagonal in debug builds)
. (unnamed) - an IS argument that is not used by this routine
- info      - factorization options; only info->fill is read here

  What it does, in order: resets any previous factor data, duplicates A's metadata into fact, allocates device
  arrays for the factor and copies A's row offsets and column indices into them, creates the cuSPARSE descriptors,
  queries and allocates the work buffers, runs the csric02 analysis, estimates the flop count of the numeric phase
  and installs MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() as the numeric factorization routine.
*/
2029 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2030 {
2031   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2032   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2033   PetscInt                      m, nz;
2034 
2035   PetscFunctionBegin;
       /* Input validation is only compiled into debug builds */
2036   if (PetscDefined(USE_DEBUG)) {
2037     PetscInt  i;
2038     PetscBool flg, missing;
2039 
2040     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2041     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2042     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2043     PetscCall(MatMissingDiagonal(A, &missing, &i));
2044     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2045   }
2046 
2047   /* Free the old stale stuff */
2048   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2049 
2050   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2051      but they will not be used. Allocate them just for easy debugging.
2052    */
2053   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2054 
2055   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2056   fact->factortype             = MAT_FACTOR_ICC;
2057   fact->info.factor_mallocs    = 0;
2058   fact->info.fill_ratio_given  = info->fill;
2059   fact->info.fill_ratio_needed = 1.0;
2060 
2061   aij->row = NULL;
2062   aij->col = NULL;
2063 
2064   /* ====================================================================== */
2065   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2066   /* We'll do in-place factorization on fact                                */
2067   /* ====================================================================== */
       /* Ai and Aj are obtained from MatSeqAIJCUSPARSEGetIJ() below and used as sources of device-to-device copies */
2068   const int *Ai, *Aj;
2069 
2070   m  = fact->rmap->n;
2071   nz = aij->nz;
2072 
2073   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2074   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2075   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2076   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2077   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2078   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2079 
2080   /* ====================================================================== */
2081   /* Create mat descriptors for M, L                                        */
2082   /* ====================================================================== */
2083   cusparseFillMode_t fillMode;
2084   cusparseDiagType_t diagType;
2085 
2086   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2087   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2088   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2089 
2090   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2091     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2092     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2093     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2094     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2095   */
       /* The L descriptor is built on the same three device arrays (csrRowPtr32, csrColIdx32, csrVal) that are passed to the csric02 calls */
2096   fillMode = CUSPARSE_FILL_MODE_LOWER;
2097   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2098   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2099   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2100   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2101 
2102   /* ========================================================================= */
2103   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2104   /* ========================================================================= */
2105   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
       /* The csric02 buffer-size query is only issued when the matrix has at least one row */
2106   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2107 
2108   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2109   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2110 
2111   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2112   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2113 
2114   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2115   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2116 
2117   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2118   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2119 
2120   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2121      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2122    */
       /* The factorization buffer is aliased with whichever SpSV buffer is larger and is sized to the max of the two needs */
2123   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2124     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2125     fs->spsvBuffer_L = fs->factBuffer_M;
2126     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2127   } else {
2128     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2129     fs->spsvBuffer_Lt = fs->factBuffer_M;
2130     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2131   }
2132 
2133   /* ========================================================================== */
2134   /* Perform analysis of ic0 on M                                               */
2135   /* The lower triangular part of M has the same sparsity pattern as L          */
2136   /* ========================================================================== */
2137   int              structural_zero;
2138   cusparseStatus_t status;
2139 
2140   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2141   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2142   if (PetscDefined(USE_DEBUG)) {
2143     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2144     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2145     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2146   }
2147 
2148   /* Estimate FLOPs of the numeric factorization */
2149   {
         /* Ai in this scope is the row-offset array of A's Mat_SeqAIJ data, read in a host loop; it shadows the const int *Ai declared earlier */
2150     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2151     PetscInt      *Ai, nzRow, nzLeft;
2152     PetscLogDouble flops = 0.0;
2153 
2154     Ai = Aseq->i;
2155     for (PetscInt i = 0; i < m; i++) {
2156       nzRow = Ai[i + 1] - Ai[i];
2157       if (nzRow > 1) {
2158         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2159           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2160         */
             /* nzLeft takes half of the off-diagonal entries of the row as being left of the diagonal */
2161         nzLeft = (nzRow - 1) / 2;
2162         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2163       }
2164     }
2165     fs->numericFactFlops = flops;
2166   }
       /* Route the numeric phase of this factor matrix to the device ICC(0) implementation */
2167   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2168   PetscFunctionReturn(PETSC_SUCCESS);
2169 }
2170 #endif
2171 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric (I)LU factorization for MATSEQAIJCUSPARSE.

  The factorization itself is computed by MatLUFactorNumeric_SeqAIJ() after A's values have been
  brought back from the device. Unless A requests CPU solves, the solve callbacks of B are then
  pointed at the cuSPARSE implementations and MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() is called on B.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded to MatLUFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve lives in Mat_SeqAIJCUSPARSE, i.e. in A; the factored matrix B carries a Mat_SeqAIJCUSPARSETriFactors instead.
  const Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  const bool solve_on_gpu = !cusp->use_cpu_solve;

  if (solve_on_gpu) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* pick the solve variant according to whether both orderings are the identity */
    const Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>(B->data);
    PetscBool         natural_rows, natural_cols;

    PetscCall(ISIdentity(bseq->row, &natural_rows));
    PetscCall(ISIdentity(bseq->col, &natural_cols));
    if (!(natural_rows && natural_cols)) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the device */
  if (solve_on_gpu) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2210 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU for MATSEQAIJCUSPARSE.

  Resets the triangular-factor data stored in B->spptr, runs MatLUFactorSymbolic_SeqAIJ() and
  installs MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric phase.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* drop whatever an earlier factorization left behind */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2221 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU for MATSEQAIJCUSPARSE.

  With CUDA 11.4 or newer, a zero-level factorization with identity row and column orderings
  that is not requested on the host is delegated to MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  Every other case resets the triangular-factor data, runs MatILUFactorSymbolic_SeqAIJ() and
  installs MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric phase.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (!info->factoronhost) {
    PetscBool natural_rows = PETSC_FALSE, natural_cols = PETSC_FALSE;

    PetscCall(ISIdentity(isrow, &natural_rows));
    PetscCall(ISIdentity(iscol, &natural_cols));
    if (!info->levels && natural_rows && natural_cols) {
      PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  /* general path: symbolic factorization by the SeqAIJ code */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2244 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC for MATSEQAIJCUSPARSE.

  With CUDA 11.4 or newer, a zero-level factorization with an identity permutation that is not
  requested on the host is delegated to MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(). Every other
  case resets the triangular-factor data, runs MatICCFactorSymbolic_SeqAIJ() and installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric phase.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    PetscBool natural = PETSC_FALSE;

    if (!info->factoronhost) PetscCall(ISIdentity(perm, &natural));
    if (!info->levels && natural) {
      PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  /* general path: symbolic factorization by the SeqAIJ code */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2264 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky for MATSEQAIJCUSPARSE.

  Resets the triangular-factor data stored in B->spptr, runs MatCholeskyFactorSymbolic_SeqAIJ()
  and installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric phase.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* drop whatever an earlier factorization left behind */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));

  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2275 
/* Reports MATSOLVERCUSPARSE as the solver type; the Mat argument is not inspected */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  type[0] = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2282 
2283 /*MC
2284   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2285   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
2286   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2287   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2288   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2289   algorithms are not recommended. This class does NOT support direct solver operations.
2290 
2291   Level: beginner
2292 
2293 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2294           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2295 M*/
2296 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the factor matrix used by the "cusparse" solver type.

  Input Parameters:
+ A     - the matrix to be factored
- ftype - the requested factorization (LU, ILU, ILUDT, Cholesky or ICC; anything else is an error)

  Output Parameter:
. B - the new n x n MATSEQAIJCUSPARSE factor matrix, n being the local row count of A

  The symbolic-factorization callbacks are the cuSPARSE versions unless A is bound to the CPU, in
  which case the plain SeqAIJ versions are installed.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt  n      = A->rmap->n;
  const PetscBool on_cpu = A->boundtocpu;
  Mat             fact;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  fact = *B;
  PetscCall(MatSetSizes(fact, n, n, n, n));
  fact->factortype = ftype; // must precede MatSetType() so that spptr is allocated as a Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(fact, MATSEQAIJCUSPARSE));

  if (on_cpu && A->bindingpropagates) PetscCall(MatBindToCPU(fact, PETSC_TRUE));

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(fact, A, A));
    if (on_cpu) {
      fact->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      fact->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&fact->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&fact->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&fact->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (on_cpu) {
      fact->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      fact->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&fact->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&fact->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(fact, MAT_SKIP_ALLOCATION, NULL));
  fact->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)fact, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2337 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the matrix values from the device into the host array a->a.

  Nothing happens unless A->offloadmask is PETSC_OFFLOAD_GPU. For an unfactored matrix the values
  come from the device CSR matrix; for a factored matrix they come from the csrVal array of the
  triangular factors (CUDA 11.4 or newer, and only when that array exists), otherwise the routine
  errors with PETSC_ERR_SUP. On success A->offloadmask becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  /* A->spptr is a Mat_SeqAIJCUSPARSE for unfactored matrices and a Mat_SeqAIJCUSPARSETriFactors for factored ones */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);

  const size_t nbytes = a->nz * sizeof(PetscScalar);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

    PetscCallCUDA(cudaMemcpy(a->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost));
  } else {
    const PetscScalar *d_factvals = NULL;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* a factorized matrix kept on the device can be copied back when its value array exists */
    d_factvals = fs->csrVal;
#endif
    PetscCheck(d_factvals, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCallCUDA(cudaMemcpy(a->a, d_factvals, nbytes, cudaMemcpyDeviceToHost));
  }
  PetscCall(PetscLogGpuToCpu(nbytes));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2367 
/* Read-write access to the host value array: the values are first copied back from the device if needed */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  Mat_SeqAIJ *host = static_cast<Mat_SeqAIJ *>(A->data);
  array[0]         = host->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2375 
/* Ends read-write access: marks the host copy as the only valid one and clears the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  array[0]       = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2383 
/* Read-only access to the host value array: the values are first copied back from the device if needed */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  const Mat_SeqAIJ *host = static_cast<Mat_SeqAIJ *>(A->data);
  array[0]               = host->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2391 
/* Ends read-only access: only clears the caller's pointer, the matrix itself is left untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2398 
/* Write-only access to the host value array: no device-to-host copy is made */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  Mat_SeqAIJ *host = static_cast<Mat_SeqAIJ *>(A->data);
  array[0]         = host->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2405 
/* Ends write-only access: marks the host copy as the only valid one and clears the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  array[0]       = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2413 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - Returns device pointers to the CSR arrays of an unfactored matrix.

  Input Parameter:
. A - the matrix; must not be a factored matrix

  Output Parameters (each may be NULL if not wanted):
+ i     - row offsets on the device (not available with 64-bit PetscInt, requesting it is an error)
. j     - column indices on the device (not available with 64-bit PetscInt, requesting it is an error)
. a     - matrix values on the device
- mtype - set to PETSC_MEMTYPE_CUDA

  The matrix state is validated BEFORE MatSeqAIJCUSPARSECopyToGPU() is called, because that routine
  dereferences A->spptr as a Mat_SeqAIJCUSPARSE, which is not what a factored matrix stores there.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* In ELL/HYB format the stored handle is not a CsrMatrix, so the cast below would be invalid */
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Only supported for the MAT_CUSPARSE_CSR storage format");
  PetscCheck(cusp->mat && cusp->mat->mat, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Missing CSR matrix on the device");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2444 
/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the device copy of a MATSEQAIJCUSPARSE matrix up to date with the host data.

  Input Parameter:
. A - the matrix; it is an error if A is bound to the CPU

  Work is only done when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU:
  - if the nonzero state is unchanged and the storage format is CSR, only the values are uploaded again;
  - otherwise the existing device matrix is destroyed and rebuilt from the host CSR arrays (using the
    compressed-row arrays when a->compressedrow.use is set).
  Afterwards A->offloadmask is set to PETSC_OFFLOAD_BOTH, except when the host has no value array
  (a->a is NULL), in which case the mask is left as it was.
*/
2445 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2446 {
2447   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2448   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2449   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2450   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2451   cusparseStatus_t              stat;
2452   PetscBool                     both = PETSC_TRUE;
2453 
2454   PetscFunctionBegin;
2455   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2456   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2457     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2458       CsrMatrix *matrix;
2459       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2460 
2461       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2462       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2463       matrix->values->assign(a->a, a->a + a->nz);
2464       PetscCallCUDA(WaitForCUDA());
2465       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2466       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2467       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2468     } else {
           /* Full rebuild: destroy the device matrix, invalidate the transpose and delete the work arrays before re-creating everything */
2469       PetscInt nnz;
2470       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2471       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2472       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2473       delete cusparsestruct->workVector;
2474       delete cusparsestruct->rowoffsets_gpu;
2475       cusparsestruct->workVector     = NULL;
2476       cusparsestruct->rowoffsets_gpu = NULL;
2477       try {
             /* Select the host row offsets to upload: compressed-row arrays when in use, otherwise the full ones */
2478         if (a->compressedrow.use) {
2479           m    = a->compressedrow.nrows;
2480           ii   = a->compressedrow.i;
2481           ridx = a->compressedrow.rindex;
2482         } else {
2483           m    = A->rmap->n;
2484           ii   = a->i;
2485           ridx = NULL;
2486         }
2487         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
             /* Without host values the nonzero count is read from the row offsets, and the matrix is not marked as synchronized at the end */
2488         if (!a->a) {
2489           nnz  = ii[m];
2490           both = PETSC_FALSE;
2491         } else nnz = a->nz;
2492         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2493 
2494         /* create cusparse matrix */
2495         cusparsestruct->nrows = m;
2496         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2497         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2498         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2499         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2500 
             /* The scalar constants one/zero/one are placed in device memory; the handle is switched to device pointer mode below */
2501         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2502         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2503         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2504         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2505         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2506         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2507         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2508 
2509         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2510         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2511           /* set the matrix */
2512           CsrMatrix *mat   = new CsrMatrix;
2513           mat->num_rows    = m;
2514           mat->num_cols    = A->cmap->n;
2515           mat->num_entries = nnz;
2516           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2517           mat->row_offsets->assign(ii, ii + m + 1);
2518 
2519           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2520           mat->column_indices->assign(a->j, a->j + nnz);
2521 
2522           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2523           if (a->a) mat->values->assign(a->a, a->a + nnz);
2524 
2525           /* assign the pointer */
2526           matstruct->mat = mat;
2527 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2528           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2529             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2530                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2531             PetscCallCUSPARSE(stat);
2532           }
2533 #endif
2534         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2535 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2536           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2537 #else
               /* A temporary CSR matrix is uploaded, converted with cusparse_csr2hyb(), and then deleted; only the hybrid matrix is kept */
2538           CsrMatrix *mat   = new CsrMatrix;
2539           mat->num_rows    = m;
2540           mat->num_cols    = A->cmap->n;
2541           mat->num_entries = nnz;
2542           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2543           mat->row_offsets->assign(ii, ii + m + 1);
2544 
2545           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2546           mat->column_indices->assign(a->j, a->j + nnz);
2547 
2548           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2549           if (a->a) mat->values->assign(a->a, a->a + nnz);
2550 
2551           cusparseHybMat_t hybMat;
2552           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2553           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2554           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2555           PetscCallCUSPARSE(stat);
2556           /* assign the pointer */
2557           matstruct->mat = hybMat;
2558 
2559           if (mat) {
2560             if (mat->values) delete (THRUSTARRAY *)mat->values;
2561             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2562             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2563             delete (CsrMatrix *)mat;
2564           }
2565 #endif
2566         }
2567 
2568         /* assign the compressed row indices */
2569         if (a->compressedrow.use) {
2570           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2571           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2572           matstruct->cprowIndices->assign(ridx, ridx + m);
2573           tmp = m;
2574         } else {
2575           cusparsestruct->workVector = NULL;
2576           matstruct->cprowIndices    = NULL;
2577           tmp                        = 0;
2578         }
             /* tmp is the number of compressed-row indices uploaded (0 when compressed rows are not used); the 3 scalars are alpha_one, beta_zero and beta_one */
2579         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2580 
2581         /* assign the pointer */
2582         cusparsestruct->mat = matstruct;
2583       } catch (char *ex) {
2584         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2585       }
2586       PetscCallCUDA(WaitForCUDA());
2587       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
           /* Record the nonzero state that the device structure now corresponds to, enabling the values-only path next time */
2588       cusparsestruct->nonzerostate = A->nonzerostate;
2589     }
2590     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2591   }
2592   PetscFunctionReturn(PETSC_SUCCESS);
2593 }
2594 
/* Functor on a 2-tuple: adds element 0 to element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    const auto updated = thrust::get<1>(t) + thrust::get<0>(t);

    thrust::get<1>(t) = updated;
  }
};
2602 
/* Functor on a 2-tuple: copies element 0 into element 1 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    // destination is slot 1, source is slot 0
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2610 
/* Functor on a 2-tuple: copies element 1 into element 0 (the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    // destination is slot 0, source is slot 1
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2618 
/*
  MatMatCusparse - Per-product data kept in C->product->data for matrix products involving MATSEQAIJCUSPARSE
  (it is read back from product->data in MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA()).
  All resources held here are released by MatDestroy_MatMatCusparse().
*/
2619 struct MatMatCusparse {
2620   PetscBool      cisdense;
       /* released with cudaFree() in MatDestroy_MatMatCusparse() */
2621   PetscScalar   *Bt;
       /* released with MatDestroy() in MatDestroy_MatMatCusparse() */
2622   Mat            X;
2623   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2624   PetscLogDouble flops;
       /* released with delete in MatDestroy_MatMatCusparse() */
2625   CsrMatrix     *Bcsr;
2626 
2627 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
       /* cuSPARSE descriptors below are destroyed, when non-NULL, in MatDestroy_MatMatCusparse() */
2628   cusparseSpMatDescr_t matSpBDescr;
2629   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2630   cusparseDnMatDescr_t matBDescr;
2631   cusparseDnMatDescr_t matCDescr;
2632   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2633   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
       /* extra work buffers, released with cudaFree() when non-NULL */
2634   void *dBuffer4;
2635   void *dBuffer5;
2636   #endif
2637   size_t                mmBufferSize;
       /* work buffers, released with cudaFree() when non-NULL */
2638   void                 *mmBuffer;
2639   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2640   cusparseSpGEMMDescr_t spgemmDesc;
2641 #endif
2642 };
2643 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product context.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy; the struct itself is freed with PetscFree()

  Frees the device buffers, destroys the cuSPARSE descriptors that were created, destroys the
  auxiliary matrix X and finally frees the context.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = static_cast<MatMatCusparse *>(data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors: only those that exist are destroyed */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  /* work buffers */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2667 
2668 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2669 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of the product of a sparse
  MATSEQAIJCUSPARSE matrix with a dense matrix

  Input Parameter:
. C - the product matrix; C->product provides the sparse operand A, the dense operand B and the
      MatMatCusparse data attached to the product (an error is raised if the data is missing)

  Notes:
  Handles MATPRODUCT_AB, MATPRODUCT_AtB, MATPRODUCT_ABt, MATPRODUCT_PtAP and MATPRODUCT_RARt.
  For PtAP and RARt the sparse-dense product is written to the intermediate matrix mmdata->X and
  C is then obtained with MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal().

  If B is not of type MATSEQDENSECUDA it is converted in place for the computation and converted
  to MATSEQDENSE before returning. C is converted to MATSEQDENSE when mmdata->cisdense is set.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  /* select the sparse operand (A or its explicit transpose), the operation applied to it and
     the sizes m x n of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* the sparse-dense product goes to the intermediate X for the triple products, to C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
  #else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
  #endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;

    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    } else {
      /* descriptor is retained (only the LDA of the output changed): it must still point to the current array of B */
      PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    } else {
      /* descriptor is retained (only the LDA of B changed): it must still point to the current output array */
      PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
  #endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    } else {
      /* reused sparse descriptor: update the values pointer exactly as done when nothing is rebuilt */
      PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* grow the work buffer only when there is none yet or the current one is too small */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
  #endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the buffer mmdata->Bt */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  /* finish the triple products with a dense-dense multiplication involving B and the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* give back CPU matrices if that is what the caller provided */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2845 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of the product of a sparse
  MATSEQAIJCUSPARSE matrix with a dense matrix

  Input Parameter:
. C - the product matrix; C->product provides the sparse operand A and the dense operand B, and
      must not have product data attached yet

  Notes:
  Sets the sizes, block sizes and type (MATSEQDENSECUDA) of C, creates the MatMatCusparse product
  data and installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA() as the numeric routine.
  Handles MATPRODUCT_AB, MATPRODUCT_AtB, MATPRODUCT_ABt, MATPRODUCT_PtAP and MATPRODUCT_RARt;
  A must use the MAT_CUSPARSE_CSR storage format.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* sizes (m x n) and block sizes of C for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data: attach it to the product right away, so that it is released by the product's
     destroy callback even if one of the calls below fails */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2925 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the product of two sparse
  MATSEQAIJCUSPARSE matrices

  Input Parameter:
. C - the product matrix, of type MATSEQAIJCUSPARSE; C->product provides the operands A and B and
      the MatMatCusparse data attached to the product (an error is raised if the data is missing)

  Notes:
  Handles MATPRODUCT_AB, MATPRODUCT_AtB and MATPRODUCT_ABt. AtB (ABt) is treated as AB when A (B)
  is flagged symmetric; otherwise the explicit transpose of the operand is used, since the
  cuSPARSE operations are always NON_TRANSPOSE here.

  Nothing is computed when mmdata->reusesym is set (the flag is cleared) or when C has no
  nonzeros; only the assembly bookkeeping at the end is performed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* a transposed product with a symmetric operand is a plain AB product; the symbolic phase
     must have made the same choice */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    /* bring the explicit transpose up to date before using it, as done in the symbolic phase;
       NOTE(review): assumes this call is cheap when the transpose is already current - confirm */
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    /* same as above, for the explicit transpose of B */
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one and beta_zero passed below are members of Cmat used as device-side scalars */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count stored in the product data */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* the freshly computed values live on the GPU */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3046 
3047 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3048 {
3049   Mat_Product                  *product = C->product;
3050   Mat                           A, B;
3051   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3052   Mat_SeqAIJ                   *a, *b, *c;
3053   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3054   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3055   PetscInt                      i, j, m, n, k;
3056   PetscBool                     flg;
3057   cusparseStatus_t              stat;
3058   MatProductType                ptype;
3059   MatMatCusparse               *mmdata;
3060   PetscLogDouble                flops;
3061   PetscBool                     biscompressed, ciscompressed;
3062 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3063   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3064   cusparseSpMatDescr_t BmatSpDescr;
3065 #else
3066   int cnz;
3067 #endif
3068   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3069 
3070   PetscFunctionBegin;
3071   MatCheckProduct(C, 1);
3072   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3073   A = product->A;
3074   B = product->B;
3075   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3076   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3077   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3078   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3079   a = (Mat_SeqAIJ *)A->data;
3080   b = (Mat_SeqAIJ *)B->data;
3081   /* product data */
3082   PetscCall(PetscNew(&mmdata));
3083   C->product->data    = mmdata;
3084   C->product->destroy = MatDestroy_MatMatCusparse;
3085 
3086   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3087   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3088   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3089   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3090   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3091   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3092 
3093   ptype = product->type;
3094   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3095     ptype                                          = MATPRODUCT_AB;
3096     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3097   }
3098   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3099     ptype                                          = MATPRODUCT_AB;
3100     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3101   }
3102   biscompressed = PETSC_FALSE;
3103   ciscompressed = PETSC_FALSE;
3104   switch (ptype) {
3105   case MATPRODUCT_AB:
3106     m    = A->rmap->n;
3107     n    = B->cmap->n;
3108     k    = A->cmap->n;
3109     Amat = Acusp->mat;
3110     Bmat = Bcusp->mat;
3111     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3112     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3113     break;
3114   case MATPRODUCT_AtB:
3115     m = A->cmap->n;
3116     n = B->cmap->n;
3117     k = A->rmap->n;
3118     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3119     Amat = Acusp->matTranspose;
3120     Bmat = Bcusp->mat;
3121     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3122     break;
3123   case MATPRODUCT_ABt:
3124     m = A->rmap->n;
3125     n = B->rmap->n;
3126     k = A->cmap->n;
3127     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3128     Amat = Acusp->mat;
3129     Bmat = Bcusp->matTranspose;
3130     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3131     break;
3132   default:
3133     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3134   }
3135 
3136   /* create cusparse matrix */
3137   PetscCall(MatSetSizes(C, m, n, m, n));
3138   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3139   c     = (Mat_SeqAIJ *)C->data;
3140   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3141   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3142   Ccsr  = new CsrMatrix;
3143 
3144   c->compressedrow.use = ciscompressed;
3145   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3146     c->compressedrow.nrows = a->compressedrow.nrows;
3147     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3148     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3149     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3150     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3151     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3152   } else {
3153     c->compressedrow.nrows  = 0;
3154     c->compressedrow.i      = NULL;
3155     c->compressedrow.rindex = NULL;
3156     Ccusp->workVector       = NULL;
3157     Cmat->cprowIndices      = NULL;
3158   }
3159   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3160   Ccusp->mat        = Cmat;
3161   Ccusp->mat->mat   = Ccsr;
3162   Ccsr->num_rows    = Ccusp->nrows;
3163   Ccsr->num_cols    = n;
3164   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3165   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3166   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3167   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3168   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3169   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3170   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3171   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3173   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3174   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3175     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3176     c->nz                = 0;
3177     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3178     Ccsr->values         = new THRUSTARRAY(c->nz);
3179     goto finalizesym;
3180   }
3181 
3182   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3183   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3184   Acsr = (CsrMatrix *)Amat->mat;
3185   if (!biscompressed) {
3186     Bcsr = (CsrMatrix *)Bmat->mat;
3187 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3188     BmatSpDescr = Bmat->matDescr;
3189 #endif
3190   } else { /* we need to use row offsets for the full matrix */
3191     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3192     Bcsr                 = new CsrMatrix;
3193     Bcsr->num_rows       = B->rmap->n;
3194     Bcsr->num_cols       = cBcsr->num_cols;
3195     Bcsr->num_entries    = cBcsr->num_entries;
3196     Bcsr->column_indices = cBcsr->column_indices;
3197     Bcsr->values         = cBcsr->values;
3198     if (!Bcusp->rowoffsets_gpu) {
3199       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3200       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3201       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3202     }
3203     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3204     mmdata->Bcsr      = Bcsr;
3205 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3206     if (Bcsr->num_rows && Bcsr->num_cols) {
3207       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3208       PetscCallCUSPARSE(stat);
3209     }
3210     BmatSpDescr = mmdata->matSpBDescr;
3211 #endif
3212   }
3213   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3214   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3215   /* precompute flops count */
3216   if (ptype == MATPRODUCT_AB) {
3217     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3218       const PetscInt st = a->i[i];
3219       const PetscInt en = a->i[i + 1];
3220       for (j = st; j < en; j++) {
3221         const PetscInt brow = a->j[j];
3222         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3223       }
3224     }
3225   } else if (ptype == MATPRODUCT_AtB) {
3226     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3227       const PetscInt anzi = a->i[i + 1] - a->i[i];
3228       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3229       flops += (2. * anzi) * bnzi;
3230     }
3231   } else { /* TODO */
3232     flops = 0.;
3233   }
3234 
3235   mmdata->flops = flops;
3236   PetscCall(PetscLogGpuTimeBegin());
3237 
3238 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3239   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3240   // cuda-12.2 requires non-null csrRowOffsets
3241   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3242   PetscCallCUSPARSE(stat);
3243   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3244   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3245   {
3246     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3247      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3248   */
3249     void *dBuffer1 = NULL;
3250     void *dBuffer2 = NULL;
3251     void *dBuffer3 = NULL;
3252     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3253     size_t bufferSize1 = 0;
3254     size_t bufferSize2 = 0;
3255     size_t bufferSize3 = 0;
3256     size_t bufferSize4 = 0;
3257     size_t bufferSize5 = 0;
3258 
3259     /* ask bufferSize1 bytes for external memory */
3260     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3261     PetscCallCUSPARSE(stat);
3262     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3263     /* inspect the matrices A and B to understand the memory requirement for the next step */
3264     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3265     PetscCallCUSPARSE(stat);
3266 
3267     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3268     PetscCallCUSPARSE(stat);
3269     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3270     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3271     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3272     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3273     PetscCallCUSPARSE(stat);
3274     PetscCallCUDA(cudaFree(dBuffer1));
3275     PetscCallCUDA(cudaFree(dBuffer2));
3276 
3277     /* get matrix C non-zero entries C_nnz1 */
3278     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3279     c->nz = (PetscInt)C_nnz1;
3280     /* allocate matrix C */
3281     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3282     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3283     Ccsr->values = new THRUSTARRAY(c->nz);
3284     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3285     /* update matC with the new pointers */
3286     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3287     PetscCallCUSPARSE(stat);
3288 
3289     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3290     PetscCallCUSPARSE(stat);
3291     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3292     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3293     PetscCallCUSPARSE(stat);
3294     PetscCallCUDA(cudaFree(dBuffer3));
3295     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3296     PetscCallCUSPARSE(stat);
3297     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3298   }
3299   #else
3300   size_t bufSize2;
3301   /* ask bufferSize bytes for external memory */
3302   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3303   PetscCallCUSPARSE(stat);
3304   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3305   /* inspect the matrices A and B to understand the memory requirement for the next step */
3306   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3307   PetscCallCUSPARSE(stat);
3308   /* ask bufferSize again bytes for external memory */
3309   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3310   PetscCallCUSPARSE(stat);
3311   /* The CUSPARSE documentation is not clear, nor the API
3312      We need both buffers to perform the operations properly!
3313      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3314      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3315      is stored in the descriptor! What a messy API... */
3316   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3317   /* compute the intermediate product of A * B */
3318   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3319   PetscCallCUSPARSE(stat);
3320   /* get matrix C non-zero entries C_nnz1 */
3321   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3322   c->nz = (PetscInt)C_nnz1;
3323   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3324                       mmdata->mmBufferSize / 1024));
3325   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3326   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3327   Ccsr->values = new THRUSTARRAY(c->nz);
3328   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3329   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3330   PetscCallCUSPARSE(stat);
3331   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3332   PetscCallCUSPARSE(stat);
3333   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3334 #else
3335   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3336   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3337                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3338   PetscCallCUSPARSE(stat);
3339   c->nz                = cnz;
3340   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3341   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3342   Ccsr->values = new THRUSTARRAY(c->nz);
3343   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3344 
3345   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3346   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3347      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3348      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3349   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3350                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3351   PetscCallCUSPARSE(stat);
3352 #endif
3353   PetscCall(PetscLogGpuFlops(mmdata->flops));
3354   PetscCall(PetscLogGpuTimeEnd());
3355 finalizesym:
3356   c->free_a = PETSC_TRUE;
3357   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3358   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3359   c->free_ij = PETSC_TRUE;
3360   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3361     PetscInt      *d_i = c->i;
3362     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3363     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3364     ii = *Ccsr->row_offsets;
3365     jj = *Ccsr->column_indices;
3366     if (ciscompressed) d_i = c->compressedrow.i;
3367     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3368     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3369   } else {
3370     PetscInt *d_i = c->i;
3371     if (ciscompressed) d_i = c->compressedrow.i;
3372     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3373     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3374   }
3375   if (ciscompressed) { /* need to expand host row offsets */
3376     PetscInt r = 0;
3377     c->i[0]    = 0;
3378     for (k = 0; k < c->compressedrow.nrows; k++) {
3379       const PetscInt next = c->compressedrow.rindex[k];
3380       const PetscInt old  = c->compressedrow.i[k];
3381       for (; r < next; r++) c->i[r + 1] = old;
3382     }
3383     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3384   }
3385   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3386   PetscCall(PetscMalloc1(m, &c->ilen));
3387   PetscCall(PetscMalloc1(m, &c->imax));
3388   c->maxnz         = c->nz;
3389   c->nonzerorowcnt = 0;
3390   c->rmax          = 0;
3391   for (k = 0; k < m; k++) {
3392     const PetscInt nn = c->i[k + 1] - c->i[k];
3393     c->ilen[k] = c->imax[k] = nn;
3394     c->nonzerorowcnt += (PetscInt)!!nn;
3395     c->rmax = PetscMax(c->rmax, nn);
3396   }
3397   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3398   PetscCall(PetscMalloc1(c->nz, &c->a));
3399   Ccsr->num_entries = c->nz;
3400 
3401   C->nonzerostate++;
3402   PetscCall(PetscLayoutSetUp(C->rmap));
3403   PetscCall(PetscLayoutSetUp(C->cmap));
3404   Ccusp->nonzerostate = C->nonzerostate;
3405   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3406   C->preallocated     = PETSC_TRUE;
3407   C->assembled        = PETSC_FALSE;
3408   C->was_assembled    = PETSC_FALSE;
3409   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3410     mmdata->reusesym = PETSC_TRUE;
3411     C->offloadmask   = PETSC_OFFLOAD_GPU;
3412   }
3413   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3414   PetscFunctionReturn(PETSC_SUCCESS);
3415 }
3416 
3417 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3418 
3419 /* handles sparse or dense B */
3420 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3421 {
3422   Mat_Product *product = mat->product;
3423   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3424 
3425   PetscFunctionBegin;
3426   MatCheckProduct(mat, 1);
3427   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3428   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3429   if (product->type == MATPRODUCT_ABC) {
3430     Ciscusp = PETSC_FALSE;
3431     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3432   }
3433   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3434     PetscBool usecpu = PETSC_FALSE;
3435     switch (product->type) {
3436     case MATPRODUCT_AB:
3437       if (product->api_user) {
3438         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3439         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3440         PetscOptionsEnd();
3441       } else {
3442         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3443         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3444         PetscOptionsEnd();
3445       }
3446       break;
3447     case MATPRODUCT_AtB:
3448       if (product->api_user) {
3449         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3450         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3451         PetscOptionsEnd();
3452       } else {
3453         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3454         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3455         PetscOptionsEnd();
3456       }
3457       break;
3458     case MATPRODUCT_PtAP:
3459       if (product->api_user) {
3460         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3461         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3462         PetscOptionsEnd();
3463       } else {
3464         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3465         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3466         PetscOptionsEnd();
3467       }
3468       break;
3469     case MATPRODUCT_RARt:
3470       if (product->api_user) {
3471         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3472         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3473         PetscOptionsEnd();
3474       } else {
3475         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3476         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3477         PetscOptionsEnd();
3478       }
3479       break;
3480     case MATPRODUCT_ABC:
3481       if (product->api_user) {
3482         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3483         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3484         PetscOptionsEnd();
3485       } else {
3486         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3487         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3488         PetscOptionsEnd();
3489       }
3490       break;
3491     default:
3492       break;
3493     }
3494     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3495   }
3496   /* dispatch */
3497   if (isdense) {
3498     switch (product->type) {
3499     case MATPRODUCT_AB:
3500     case MATPRODUCT_AtB:
3501     case MATPRODUCT_ABt:
3502     case MATPRODUCT_PtAP:
3503     case MATPRODUCT_RARt:
3504       if (product->A->boundtocpu) {
3505         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3506       } else {
3507         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3508       }
3509       break;
3510     case MATPRODUCT_ABC:
3511       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3512       break;
3513     default:
3514       break;
3515     }
3516   } else if (Biscusp && Ciscusp) {
3517     switch (product->type) {
3518     case MATPRODUCT_AB:
3519     case MATPRODUCT_AtB:
3520     case MATPRODUCT_ABt:
3521       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3522       break;
3523     case MATPRODUCT_PtAP:
3524     case MATPRODUCT_RARt:
3525     case MATPRODUCT_ABC:
3526       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3527       break;
3528     default:
3529       break;
3530     }
3531   } else { /* fallback for AIJ */
3532     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3533   }
3534   PetscFunctionReturn(PETSC_SUCCESS);
3535 }
3536 
3537 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3538 {
3539   PetscFunctionBegin;
3540   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3541   PetscFunctionReturn(PETSC_SUCCESS);
3542 }
3543 
3544 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3545 {
3546   PetscFunctionBegin;
3547   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3548   PetscFunctionReturn(PETSC_SUCCESS);
3549 }
3550 
3551 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3552 {
3553   PetscFunctionBegin;
3554   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3555   PetscFunctionReturn(PETSC_SUCCESS);
3556 }
3557 
3558 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3559 {
3560   PetscFunctionBegin;
3561   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3562   PetscFunctionReturn(PETSC_SUCCESS);
3563 }
3564 
3565 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3566 {
3567   PetscFunctionBegin;
3568   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3569   PetscFunctionReturn(PETSC_SUCCESS);
3570 }
3571 
3572 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3573 {
3574   int i = blockIdx.x * blockDim.x + threadIdx.x;
3575   if (i < n) y[idx[i]] += x[i];
3576 }
3577 
3578 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3579 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3580 {
3581   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3582   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3583   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3584   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3585   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3586   PetscBool                     compressed;
3587 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3588   PetscInt nx, ny;
3589 #endif
3590 
3591   PetscFunctionBegin;
3592   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3593   if (!a->nz) {
3594     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3595     else PetscCall(VecSeq_CUDA::Set(zz, 0));
3596     PetscFunctionReturn(PETSC_SUCCESS);
3597   }
3598   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3599   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3600   if (!trans) {
3601     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3602     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3603   } else {
3604     if (herm || !A->form_explicit_transpose) {
3605       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3606       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3607     } else {
3608       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3609       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3610     }
3611   }
3612   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3613   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3614 
3615   try {
3616     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3617     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3618     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3619 
3620     PetscCall(PetscLogGpuTimeBegin());
3621     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3622       /* z = A x + beta y.
3623          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3624          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3625       */
3626       xptr = xarray;
3627       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3628       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3629 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3630       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3631           allocated to accommodate different uses. So we get the length info directly from mat.
3632        */
3633       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3634         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3635         nx             = mat->num_cols; // since y = Ax
3636         ny             = mat->num_rows;
3637       }
3638 #endif
3639     } else {
3640       /* z = A^T x + beta y
3641          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3642          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3643        */
3644       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3645       dptr = zarray;
3646       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3647       if (compressed) { /* Scatter x to work vector */
3648         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3649 
3650         thrust::for_each(
3651 #if PetscDefined(HAVE_THRUST_ASYNC)
3652           thrust::cuda::par.on(PetscDefaultCudaStream),
3653 #endif
3654           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3655           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3656       }
3657 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3658       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3659         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3660         nx             = mat->num_rows; // since y = A^T x
3661         ny             = mat->num_cols;
3662       }
3663 #endif
3664     }
3665 
3666     /* csr_spmv does y = alpha op(A) x + beta y */
3667     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3668 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3669   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3670       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3671   #else
3672       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3673   #endif
3674 
3675       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3676   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3677       if (!matDescr) {
3678         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3679         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3680       }
3681   #endif
3682 
3683       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3684         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3685         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3686         PetscCallCUSPARSE(
3687           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3688         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3689   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3690         PetscCallCUSPARSE(
3691           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3692   #endif
3693         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3694       } else {
3695         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3696         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3697         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3698       }
3699 
3700       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3701 #else
3702       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3703       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3704 #endif
3705     } else {
3706       if (cusparsestruct->nrows) {
3707 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3708         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3709 #else
3710         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3711         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3712 #endif
3713       }
3714     }
3715     PetscCall(PetscLogGpuTimeEnd());
3716 
3717     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3718       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3719         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3720           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3721         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3722           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3723         }
3724       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3725         PetscCall(VecSeq_CUDA::Set(zz, 0));
3726       }
3727 
3728       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3729       if (compressed) {
3730         PetscCall(PetscLogGpuTimeBegin());
3731         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3732         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3733         PetscCall(PetscLogGpuTimeEnd());
3734       }
3735     } else {
3736       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3737     }
3738     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3739     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3740     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3741   } catch (char *ex) {
3742     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3743   }
3744   if (yy) {
3745     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3746   } else {
3747     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3748   }
3749   PetscFunctionReturn(PETSC_SUCCESS);
3750 }
3751 
3752 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3753 {
3754   PetscFunctionBegin;
3755   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3756   PetscFunctionReturn(PETSC_SUCCESS);
3757 }
3758 
3759 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3760 {
3761   PetscFunctionBegin;
3762   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3763   PetscFunctionReturn(PETSC_SUCCESS);
3764 }
3765 
3766 /*@
3767   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3768   (the default parallel PETSc format).
3769 
3770   Collective
3771 
3772   Input Parameters:
3773 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3774 . m    - number of rows
3775 . n    - number of columns
3776 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
3777 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3778 
3779   Output Parameter:
3780 . A - the matrix
3781 
3782   Level: intermediate
3783 
3784   Notes:
3785   This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
3786   calculations. For good matrix assembly performance the user should preallocate the matrix
3787   storage by setting the parameter `nz` (or the array `nnz`).
3788 
3789   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3790   MatXXXXSetPreallocation() paradgm instead of this routine directly.
3791   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3792 
3793   The AIJ format, also called
3794   compressed row storage, is fully compatible with standard Fortran
3795   storage.  That is, the stored row and column indices can begin at
3796   either one (as in Fortran) or zero.
3797 
3798   Specify the preallocated storage with either nz or nnz (not both).
3799   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3800   allocation.
3801 
3802 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3803 @*/
3804 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3805 {
3806   PetscFunctionBegin;
3807   PetscCall(MatCreate(comm, A));
3808   PetscCall(MatSetSizes(*A, m, n, m, n));
3809   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3810   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3811   PetscFunctionReturn(PETSC_SUCCESS);
3812 }
3813 
3814 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3815 {
3816   PetscFunctionBegin;
3817   if (A->factortype == MAT_FACTOR_NONE) {
3818     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3819   } else {
3820     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3821   }
3822   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3823   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3824   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3825   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3826   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3827   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3828   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3829   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3830   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3831   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3832   PetscCall(MatDestroy_SeqAIJ(A));
3833   PetscFunctionReturn(PETSC_SUCCESS);
3834 }
3835 
3836 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3837 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3838 static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3839 {
3840   PetscFunctionBegin;
3841   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3842   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3843   PetscFunctionReturn(PETSC_SUCCESS);
3844 }
3845 
3846 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3847 {
3848   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3849   Mat_SeqAIJCUSPARSE *cy;
3850   Mat_SeqAIJCUSPARSE *cx;
3851   PetscScalar        *ay;
3852   const PetscScalar  *ax;
3853   CsrMatrix          *csry, *csrx;
3854 
3855   PetscFunctionBegin;
3856   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3857   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3858   if (X->ops->axpy != Y->ops->axpy) {
3859     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3860     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3861     PetscFunctionReturn(PETSC_SUCCESS);
3862   }
3863   /* if we are here, it means both matrices are bound to GPU */
3864   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3865   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3866   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3867   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3868   csry = (CsrMatrix *)cy->mat->mat;
3869   csrx = (CsrMatrix *)cx->mat->mat;
3870   /* see if we can turn this into a cublas axpy */
3871   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3872     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3873     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3874     if (eq) str = SAME_NONZERO_PATTERN;
3875   }
3876   /* spgeam is buggy with one column */
3877   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3878 
3879   if (str == SUBSET_NONZERO_PATTERN) {
3880     PetscScalar b = 1.0;
3881 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3882     size_t bufferSize;
3883     void  *buffer;
3884 #endif
3885 
3886     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3887     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3888     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3889 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3890     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3891                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3892     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3893     PetscCall(PetscLogGpuTimeBegin());
3894     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3895                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3896     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3897     PetscCall(PetscLogGpuTimeEnd());
3898     PetscCallCUDA(cudaFree(buffer));
3899 #else
3900     PetscCall(PetscLogGpuTimeBegin());
3901     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3902                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3903     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3904     PetscCall(PetscLogGpuTimeEnd());
3905 #endif
3906     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3907     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3908     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3909     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3910   } else if (str == SAME_NONZERO_PATTERN) {
3911     cublasHandle_t cublasv2handle;
3912     PetscBLASInt   one = 1, bnz = 1;
3913 
3914     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3915     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3916     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3917     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3918     PetscCall(PetscLogGpuTimeBegin());
3919     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3920     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3921     PetscCall(PetscLogGpuTimeEnd());
3922     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3923     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3924     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3925   } else {
3926     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3927     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3928   }
3929   PetscFunctionReturn(PETSC_SUCCESS);
3930 }
3931 
3932 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3933 {
3934   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3935   PetscScalar   *ay;
3936   cublasHandle_t cublasv2handle;
3937   PetscBLASInt   one = 1, bnz = 1;
3938 
3939   PetscFunctionBegin;
3940   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3941   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3942   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3943   PetscCall(PetscLogGpuTimeBegin());
3944   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3945   PetscCall(PetscLogGpuFlops(bnz));
3946   PetscCall(PetscLogGpuTimeEnd());
3947   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3948   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3949   PetscFunctionReturn(PETSC_SUCCESS);
3950 }
3951 
3952 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3953 {
3954   PetscBool   both = PETSC_FALSE;
3955   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
3956 
3957   PetscFunctionBegin;
3958   if (A->factortype == MAT_FACTOR_NONE) {
3959     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3960     if (spptr->mat) {
3961       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3962       if (matrix->values) {
3963         both = PETSC_TRUE;
3964         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3965       }
3966     }
3967     if (spptr->matTranspose) {
3968       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3969       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3970     }
3971   }
3972   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3973   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3974   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3975   else A->offloadmask = PETSC_OFFLOAD_CPU;
3976   PetscFunctionReturn(PETSC_SUCCESS);
3977 }
3978 
3979 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3980 {
3981   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3982 
3983   PetscFunctionBegin;
3984   if (A->factortype != MAT_FACTOR_NONE) {
3985     A->boundtocpu = flg;
3986     PetscFunctionReturn(PETSC_SUCCESS);
3987   }
3988   if (flg) {
3989     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3990 
3991     A->ops->scale                     = MatScale_SeqAIJ;
3992     A->ops->axpy                      = MatAXPY_SeqAIJ;
3993     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3994     A->ops->mult                      = MatMult_SeqAIJ;
3995     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3996     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3997     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3998     A->ops->multhermitiantranspose    = NULL;
3999     A->ops->multhermitiantransposeadd = NULL;
4000     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
4001     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4002     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4003     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4004     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4005     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4006     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4007     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4008   } else {
4009     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4010     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4011     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4012     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4013     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4014     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4015     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4016     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4017     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4018     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
4019     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4020     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4021     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4022     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4023     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4024     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4025     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4026 
4027     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4028     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4029     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4030     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4031     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4032     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4033   }
4034   A->boundtocpu = flg;
4035   if (flg && a->inode.size_csr) {
4036     a->inode.use = PETSC_TRUE;
4037   } else {
4038     a->inode.use = PETSC_FALSE;
4039   }
4040   PetscFunctionReturn(PETSC_SUCCESS);
4041 }
4042 
4043 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4044 {
4045   Mat B;
4046 
4047   PetscFunctionBegin;
4048   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4049   if (reuse == MAT_INITIAL_MATRIX) {
4050     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4051   } else if (reuse == MAT_REUSE_MATRIX) {
4052     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4053   }
4054   B = *newmat;
4055 
4056   PetscCall(PetscFree(B->defaultvectype));
4057   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4058 
4059   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4060     if (B->factortype == MAT_FACTOR_NONE) {
4061       Mat_SeqAIJCUSPARSE *spptr;
4062       PetscCall(PetscNew(&spptr));
4063       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4064       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4065       spptr->format = MAT_CUSPARSE_CSR;
4066 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4067   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4068       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4069   #else
4070       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4071   #endif
4072       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4073       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4074 #endif
4075       B->spptr = spptr;
4076     } else {
4077       Mat_SeqAIJCUSPARSETriFactors *spptr;
4078 
4079       PetscCall(PetscNew(&spptr));
4080       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4081       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4082       B->spptr = spptr;
4083     }
4084     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4085   }
4086   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
4087   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
4088   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
4089   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4090   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4091   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
4092 
4093   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4094   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4095   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4096 #if defined(PETSC_HAVE_HYPRE)
4097   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4098 #endif
4099   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4100   PetscFunctionReturn(PETSC_SUCCESS);
4101 }
4102 
4103 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4104 {
4105   PetscFunctionBegin;
4106   PetscCall(MatCreate_SeqAIJ(B));
4107   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4108   PetscFunctionReturn(PETSC_SUCCESS);
4109 }
4110 
4111 /*MC
4112    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4113 
4114    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4115    CSR, ELL, or Hybrid format.
4116    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4117 
4118    Options Database Keys:
4119 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4120 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4121                                       Other options include ell (ellpack) or hyb (hybrid).
4122 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4123 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4124 
4125   Level: beginner
4126 
4127 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4128 M*/
4129 
4130 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4131 {
4132   PetscFunctionBegin;
4133   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4134   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4135   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4136   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4137   PetscFunctionReturn(PETSC_SUCCESS);
4138 }
4139 
4140 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4141 {
4142   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4143 
4144   PetscFunctionBegin;
4145   if (cusp) {
4146     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4147     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4148     delete cusp->workVector;
4149     delete cusp->rowoffsets_gpu;
4150     delete cusp->csr2csc_i;
4151     delete cusp->coords;
4152     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4153     PetscCall(PetscFree(mat->spptr));
4154   }
4155   PetscFunctionReturn(PETSC_SUCCESS);
4156 }
4157 
4158 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4159 {
4160   PetscFunctionBegin;
4161   if (*mat) {
4162     delete (*mat)->values;
4163     delete (*mat)->column_indices;
4164     delete (*mat)->row_offsets;
4165     delete *mat;
4166     *mat = 0;
4167   }
4168   PetscFunctionReturn(PETSC_SUCCESS);
4169 }
4170 
4171 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4172 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4173 {
4174   PetscFunctionBegin;
4175   if (*trifactor) {
4176     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4177     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4178     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4179     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4180     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4181   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4182     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4183   #endif
4184     PetscCall(PetscFree(*trifactor));
4185   }
4186   PetscFunctionReturn(PETSC_SUCCESS);
4187 }
4188 #endif
4189 
4190 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4191 {
4192   CsrMatrix *mat;
4193 
4194   PetscFunctionBegin;
4195   if (*matstruct) {
4196     if ((*matstruct)->mat) {
4197       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4198 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4199         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4200 #else
4201         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4202         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4203 #endif
4204       } else {
4205         mat = (CsrMatrix *)(*matstruct)->mat;
4206         PetscCall(CsrMatrix_Destroy(&mat));
4207       }
4208     }
4209     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4210     delete (*matstruct)->cprowIndices;
4211     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4212     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4213     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4214 
4215 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4216     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4217     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4218 
4219     for (int i = 0; i < 3; i++) {
4220       if (mdata->cuSpMV[i].initialized) {
4221         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4222         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4223         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4224   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4225         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4226         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4227   #endif
4228       }
4229     }
4230 #endif
4231     delete *matstruct;
4232     *matstruct = NULL;
4233   }
4234   PetscFunctionReturn(PETSC_SUCCESS);
4235 }
4236 
4237 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4238 {
4239   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4240 
4241   PetscFunctionBegin;
4242   if (fs) {
4243 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4244     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4245     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4246     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4247     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4248     delete fs->workVector;
4249     fs->workVector = NULL;
4250 #endif
4251     delete fs->rpermIndices;
4252     delete fs->cpermIndices;
4253     fs->rpermIndices  = NULL;
4254     fs->cpermIndices  = NULL;
4255     fs->init_dev_prop = PETSC_FALSE;
4256 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4257     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4258     PetscCallCUDA(cudaFree(fs->csrColIdx));
4259     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4260     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4261     PetscCallCUDA(cudaFree(fs->csrVal));
4262     PetscCallCUDA(cudaFree(fs->diag));
4263     PetscCallCUDA(cudaFree(fs->X));
4264     PetscCallCUDA(cudaFree(fs->Y));
4265     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4266     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4267     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4268     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4269     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4270     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4271     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4272     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4273     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4274     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4275     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4276     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4277     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4278     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4279     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4280     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4281     PetscCall(PetscFree(fs->csrRowPtr_h));
4282     PetscCall(PetscFree(fs->csrVal_h));
4283     PetscCall(PetscFree(fs->diag_h));
4284     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4285     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4286 #endif
4287   }
4288   PetscFunctionReturn(PETSC_SUCCESS);
4289 }
4290 
4291 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4292 {
4293   PetscFunctionBegin;
4294   if (*trifactors) {
4295     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4296     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4297     PetscCall(PetscFree(*trifactors));
4298   }
4299   PetscFunctionReturn(PETSC_SUCCESS);
4300 }
4301 
4302 struct IJCompare {
4303   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4304   {
4305     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4306     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4307     return false;
4308   }
4309 };
4310 
4311 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4312 {
4313   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4314 
4315   PetscFunctionBegin;
4316   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4317   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4318   if (destroy) {
4319     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4320     delete cusp->csr2csc_i;
4321     cusp->csr2csc_i = NULL;
4322   }
4323   A->transupdated = PETSC_FALSE;
4324   PetscFunctionReturn(PETSC_SUCCESS);
4325 }
4326 
4327 static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4328 {
4329   MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;
4330 
4331   PetscFunctionBegin;
4332   PetscCallCUDA(cudaFree(coo->perm));
4333   PetscCallCUDA(cudaFree(coo->jmap));
4334   PetscCall(PetscFree(coo));
4335   PetscFunctionReturn(PETSC_SUCCESS);
4336 }
4337 
4338 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4339 {
4340   PetscBool            dev_ij = PETSC_FALSE;
4341   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
4342   PetscInt            *i, *j;
4343   PetscContainer       container_h;
4344   MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4345 
4346   PetscFunctionBegin;
4347   PetscCall(PetscGetMemType(coo_i, &mtype));
4348   if (PetscMemTypeDevice(mtype)) {
4349     dev_ij = PETSC_TRUE;
4350     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4351     PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4352     PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4353   } else {
4354     i = coo_i;
4355     j = coo_j;
4356   }
4357 
4358   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4359   if (dev_ij) PetscCall(PetscFree2(i, j));
4360   mat->offloadmask = PETSC_OFFLOAD_CPU;
4361   // Create the GPU memory
4362   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4363 
4364   // Copy the COO struct to device
4365   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4366   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4367   PetscCall(PetscMalloc1(1, &coo_d));
4368   *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4369   PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4370   PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4371   PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4372   PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4373 
4374   // Put the COO struct in a container and then attach that to the matrix
4375   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4376   PetscFunctionReturn(PETSC_SUCCESS);
4377 }
4378 
4379 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4380 {
4381   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4382   const PetscCount grid_size = gridDim.x * blockDim.x;
4383   for (; i < nnz; i += grid_size) {
4384     PetscScalar sum = 0.0;
4385     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4386     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4387   }
4388 }
4389 
4390 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4391 {
4392   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
4393   Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4394   PetscCount           Annz = seq->nz;
4395   PetscMemType         memtype;
4396   const PetscScalar   *v1 = v;
4397   PetscScalar         *Aa;
4398   PetscContainer       container;
4399   MatCOOStruct_SeqAIJ *coo;
4400 
4401   PetscFunctionBegin;
4402   if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4403 
4404   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4405   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
4406 
4407   PetscCall(PetscGetMemType(v, &memtype));
4408   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
4409     PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4410     PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4411   }
4412 
4413   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4414   else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4415 
4416   PetscCall(PetscLogGpuTimeBegin());
4417   if (Annz) {
4418     MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4419     PetscCallCUDA(cudaPeekAtLastError());
4420   }
4421   PetscCall(PetscLogGpuTimeEnd());
4422 
4423   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4424   else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4425 
4426   if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4427   PetscFunctionReturn(PETSC_SUCCESS);
4428 }
4429 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, pass `NULL` if not needed
- j - the CSR column indices, pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  Only the CSR storage format is supported; an error is raised for the ELL and HYB formats.

  The matrix is copied to the GPU (see `MatSeqAIJCUSPARSECopyToGPU()`) before the pointers are returned.
  If both `i` and `j` are `NULL` the routine returns immediately after validating `A`.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* nothing to do only when neither output is requested; a single requested output must still be filled */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read the implementation pointers only after the header and type have been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  a    = (Mat_SeqAIJ *)A->data;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the full (uncompressed) row offsets are built once from the host array and cached on the device */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4477 
/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

  Note:
  Each non-`NULL` pointer argument is reset to `NULL` on return; `compressed` is not used.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed; /* unused here */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate whichever handles the caller passed in */
  if (i != NULL) *i = NULL;
  if (j != NULL) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4503 
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

  An error is raised for the ELL and HYB storage formats.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(!(spptr->format == MAT_CUSPARSE_ELL || spptr->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* make sure the device copy exists and is current before handing out its address */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *device_csr = (CsrMatrix *)spptr->mat->mat;

    PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = device_csr->values->data().get();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4539 
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  The pointer `a` is reset to `NULL` on return; the matrix itself is not modified.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: only the caller's handle is invalidated */
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4562 
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

  The offload mask of `A` is set to `PETSC_OFFLOAD_GPU` and `MatSeqAIJCUSPARSEInvalidateTranspose()` is called on `A`.
  An error is raised for the ELL and HYB storage formats.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(!(spptr->format == MAT_CUSPARSE_ELL || spptr->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* existing values may be read by the caller, so the device copy must be current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *device_csr = (CsrMatrix *)spptr->mat->mat;

    PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = device_csr->values->data().get();
  }
  /* the caller may write through the pointer: the GPU copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  Calls `MatSeqAIJInvalidateDiagonal()` on `A`, increases the object state of `A`, and resets `a` to `NULL`.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* invalidate the caller's handle last, once the bookkeeping succeeded */
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4624 
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger host-device copies and flags data validity on the GPU

  The device data structure must already exist, otherwise an error is raised.
  An error is also raised for the ELL and HYB storage formats.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(!(spptr->format == MAT_CUSPARSE_ELL || spptr->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: the current values are not needed, so no copy to the GPU is issued here */
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *device_csr = (CsrMatrix *)spptr->mat->mat;

    PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = device_csr->values->data().get();
  }
  /* after the caller writes, the GPU copy is the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4661 
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  Calls `MatSeqAIJInvalidateDiagonal()` on `A`, increases the object state of `A`, and resets `a` to `NULL`.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values were (re)written through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* invalidate the caller's handle last, once the bookkeeping succeeded */
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4686 
/* "Less than" comparator on 4-tuples: lexicographic on the first two entries only, the last two entries are ignored.
   MatSeqAIJCUSPARSEMergeMats() uses it on tuples whose first two entries are COO row and column indices. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int first1 = thrust::get<0>(t1), first2 = thrust::get<0>(t2);

    /* the first entry decides unless it ties; ties are broken by the second entry */
    if (first1 != first2) return first1 < first2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4695 
/* Unary functor that adds a fixed integer offset to its argument */
struct Shift {
  int _shift; /* offset added to every input */

  Shift(int offset) : _shift{offset} { }
  __host__ __device__ inline int operator()(const int &value) { return _shift + value; }
};
4702 
/*
  Merges two SeqAIJCUSPARSE matrices A, B (same number of rows) by concatenating their rows,
  i.e. C = [A, B] ([A';B']' operation in MATLAB notation).

  With reuse == MAT_INITIAL_MATRIX a new matrix C with A->cmap->n + B->cmap->n columns is created: the CSR
  structure is built on the device by merging the COO triples of A and of B (columns of B shifted by
  A->cmap->n) ordered by (row, column); the positions in C of the entries of A, followed by those of B, are
  stored in Ccusp->coords; the row offsets and column indices are then copied to the host. If both A and B
  have form_explicit_transpose set, the transpose of C is assembled too.

  For any other reuse value only the numerical values of A and B are copied into the existing C (and into
  its transpose, when present) using Ccusp->coords. MAT_INPLACE_MATRIX is not supported. Only the CSR
  storage format is supported for A, B (and C when reused).
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its (empty) device data structures */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      { /* the COO work arrays are automatic objects of this scope: they are released at its end and on any early error return */
        THRUSTINTARRAY32  Acoo(Annz);
        THRUSTINTARRAY32  Bcoo(Bnnz);
        THRUSTINTARRAY32  Ccoo(c->nz);
        THRUSTINTARRAY32 *Aroff, *Broff;

        if (a->compressedrow.use) { /* need full row offset */
          if (!Acusp->rowoffsets_gpu) {
            Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
            Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
            PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
          }
          Aroff = Acusp->rowoffsets_gpu;
        } else Aroff = Acsr->row_offsets;
        if (b->compressedrow.use) { /* need full row offset */
          if (!Bcusp->rowoffsets_gpu) {
            Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
            Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
            PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
          }
          Broff = Bcusp->rowoffsets_gpu;
        } else Broff = Bcsr->row_offsets;
        PetscCall(PetscLogGpuTimeBegin());
        /* expand the CSR row offsets of A and B into explicit COO row indices */
        stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo.data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo.data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
        /* tag carried through the merge: 1 for entries coming from A, 0 for entries coming from B */
        auto Aperm = thrust::make_constant_iterator(1);
        auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
        auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
        auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
        /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
        auto Bcib = Bcsr->column_indices->begin();
        auto Bcie = Bcsr->column_indices->end();
        thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
        THRUSTINTARRAY32 wPerm(Annz + Bnnz);
        auto             Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
        auto             Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
        auto             Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(), Bcib, Bcsr->values->begin(), Bperm));
        auto             Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(), Bcie, Bcsr->values->end(), Bperm));
        auto             Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm.begin()));
        auto             p1  = Ccusp->coords->begin();
        auto             p2  = Ccusp->coords->begin();
        thrust::advance(p2, Annz);
        PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
        thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
        /* coords[0..Annz) receives the positions in C of the entries of A, coords[Annz..) those of the entries of B */
        auto cci = thrust::make_counting_iterator(zero);
        auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
        PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm.begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
        auto pred = thrust::identity<int>();
  #else
        auto pred = cuda::std::identity();
  #endif
        PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm.begin(), p1, pred));
        PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm.begin(), p2, pred));
#endif
        /* compress the merged COO row indices into the CSR row offsets of C */
        stat = cusparseXcoo2csr(Ccusp->handle, Ccoo.data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        PetscCall(PetscLogGpuTimeEnd());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* C^T = [A^T; B^T]: the CSR arrays of A^T are followed by those of B^T, with the row offsets of B^T shifted by nnz(A) */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* the last offset of A^T is overwritten by the first (shifted) offset of B^T */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the nonzero structure of C on the host */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row statistics derived from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* reuse: the nonzero structure of C is kept, only the values of A and B are scattered into it */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords = [positions of the entries of A | positions of the entries of B]; pmid marks the split */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* the values of C^T are those of A^T followed by those of B^T */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4987 
/*
  Copies n entries of the device value array of A into v[].

  Input Parameters:
+ A   - the matrix; its values are obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - host array with the positions (into the value array) of the entries to copy, or NULL to copy the first n entries

  Output Parameter:
. v - destination array; whether it is device or host memory is detected with isCudaMem()
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    THRUSTARRAY    w; /* device staging buffer, sized only when v[] is host memory; released automatically, also on error */

    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather on the device: dv[k] = av[idx[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination means the values travelled from the device to the host */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5023 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
5024