xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 421480d92be24cdb9933c60510b8e175c0a8d034)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14
18d0967f54SJacob Faibussowitsch #endif
19a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
20a2cee5feSJed Brown #include <thrust/remove.h>
21a2cee5feSJed Brown #include <thrust/sort.h>
22a2cee5feSJed Brown #include <thrust/unique.h>
2359c3d2bbSPierre Jolivet #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
2459c3d2bbSPierre Jolivet   #include <cuda/std/functional>
2559c3d2bbSPierre Jolivet #endif
26e8d2b73aSMark Adams 
27e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2946aba097SBarry Smith /*
3046aba097SBarry Smith   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31afb2bd1cSJunchao Zhang   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32afb2bd1cSJunchao Zhang */
33afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
36afb2bd1cSJunchao Zhang #endif
379ae82921SPaul Mullowney 
38087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
416fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
436fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
446fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
45d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
466fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
47d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
48d460d7bfSJunchao Zhang #endif
49ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
50a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
5133c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
526fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
536fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
546fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
556fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
56e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
57e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
58e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
599ae82921SPaul Mullowney 
607f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
61470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
62470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
632c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
647f756511SDominic Meiser 
6557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
66a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
6757181aedSStefano Zampini 
68c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
69e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
70219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71c215019aSStefano Zampini 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: // fall through: a SeqAIJCUSPARSE matrix keeps a single storage format, so both ops set the same field
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
899ae82921SPaul Mullowney 
90e057df02SPaul Mullowney /*@
9111a5261eSBarry Smith   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
9211a5261eSBarry Smith   operation. Only the `MatMult()` operation can use different GPU storage formats
9311a5261eSBarry Smith 
94e057df02SPaul Mullowney   Not Collective
95e057df02SPaul Mullowney 
96e057df02SPaul Mullowney   Input Parameters:
9711a5261eSBarry Smith + A      - Matrix of type `MATSEQAIJCUSPARSE`
982ef1f0ffSBarry Smith . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
992ef1f0ffSBarry Smith            `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
10011a5261eSBarry Smith - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
101e057df02SPaul Mullowney 
102e057df02SPaul Mullowney   Level: intermediate
103e057df02SPaul Mullowney 
104fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
105e057df02SPaul Mullowney @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  // Dispatch to the matrix-type-specific implementation, if one is composed on A;
  // PetscTryMethod() is a no-op for matrix types that do not provide it
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
113e057df02SPaul Mullowney 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  // Record the preference; the solve routines consult this flag to pick CPU vs GPU triangular solves
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
122365b711fSMark Adams 
123365b711fSMark Adams /*@
12411a5261eSBarry Smith   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
125365b711fSMark Adams 
126365b711fSMark Adams   Input Parameters:
12711a5261eSBarry Smith + A       - Matrix of type `MATSEQAIJCUSPARSE`
12811a5261eSBarry Smith - use_cpu - set flag for using the built-in CPU `MatSolve()`
129365b711fSMark Adams 
1302ef1f0ffSBarry Smith   Level: intermediate
131365b711fSMark Adams 
13211a5261eSBarry Smith   Note:
13353220ed8SBarry Smith   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
13453220ed8SBarry Smith   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
135365b711fSMark Adams   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).
136365b711fSMark Adams 
1371cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138365b711fSMark Adams @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  // Forward to the type-specific setter when the matrix type composes one
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
146365b711fSMark Adams 
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* destroy any cached transpose when the option is being turned off, to prevent
       logic errors if flg is later set back to true with a stale transpose around */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    // All other options are handled by the host (SeqAIJ) implementation
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
162e6e9a74fSStefano Zampini 
// Process runtime options controlling the storage format and cuSPARSE algorithm choices
// of a MATSEQAIJCUSPARSE matrix; only meaningful for unfactored matrices
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE      *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    // storage format for SpMV only
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, fmt));

    // storage format for all operations
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, fmt));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusp->use_cpu_solve, &cusp->use_cpu_solve, &set));
    if (set) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusp->spmvAlg, (PetscEnum *)&cusp->spmvAlg, &set));
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!set || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!set || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusp->spmmAlg, (PetscEnum *)&cusp->spmmAlg, &set));
    PetscCheck(!set || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusp->csr2cscAlg, (PetscEnum *)&cusp->csr2cscAlg, &set));
    PetscCheck(!set || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1989ae82921SPaul Mullowney 
199b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or refresh) the device copy of the combined LU factor M = (L - I) + U from the
// host factors stored in PETSc's skewed SeqAIJ factored format, then run (or update) the
// cusparseSpSV analysis needed for the triangular solves. Only acts when the latest
// factors live on the CPU (A->offloadmask == PETSC_OFFLOAD_CPU).
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // first-time setup? use csrRowPtr as the flag since it is non-null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp: only needed until the column indices reach the device
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt lnz = Ai[i + 1] - Ai[i];      // strictly-lower entries in row i
        PetscInt unz = adiag[i] - adiag[i + 1]; // diagonal plus strictly-upper entries in row i
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], lnz));                          // entries of L
        Mj[Mi[i] + lnz] = i;                                                            // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + lnz + 1, Aj + adiag[i + 1] + 1, unz - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + lnz + unz;
      }
      // Copy M's sparsity pattern (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record the host row pointers and value buffer for reuse on later numeric refreshes
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Refresh the numerical values in the merged host layout and upload them
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt lnz = Ai[i + 1] - Ai[i];
      PetscInt unz = adiag[i] - adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], lnz));                          // entries of L
      Ma[Mi[i] + lnz] = (MatScalar)1.0 / Aa[adiag[i]];                                // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + lnz + 1, Aa + adiag[i + 1] + 1, unz - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
305d460d7bfSJunchao Zhang #else
306d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
307d71ae5a4SJacob Faibussowitsch {
3089ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3099ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3109ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
311aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3129ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3139ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3149ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3159ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3169ae82921SPaul Mullowney 
3179ae82921SPaul Mullowney   PetscFunctionBegin;
3183ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
319c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3209ae82921SPaul Mullowney     try {
3219ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3229ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
323da79fbbcSStefano Zampini       if (!loTriFactor) {
3242cbc15d9SMark         PetscScalar *AALo;
3252cbc15d9SMark 
3269566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3279ae82921SPaul Mullowney 
3289ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3299566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3309566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3319ae82921SPaul Mullowney 
3329ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3339ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3349ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3359ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3369ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3379ae82921SPaul Mullowney         v         = aa;
3389ae82921SPaul Mullowney         vi        = aj;
3399ae82921SPaul Mullowney         offset    = 1;
3409ae82921SPaul Mullowney         rowOffset = 1;
3419ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3429ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
343e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3449ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3459ae82921SPaul Mullowney           rowOffset += nz + 1;
3469ae82921SPaul Mullowney 
347f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
348f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
3499ae82921SPaul Mullowney 
3509ae82921SPaul Mullowney           offset += nz;
3519ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3529ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3539ae82921SPaul Mullowney           offset += 1;
3549ae82921SPaul Mullowney 
3559ae82921SPaul Mullowney           v += nz;
3569ae82921SPaul Mullowney           vi += nz;
3579ae82921SPaul Mullowney         }
3582205254eSKarl Rupp 
359aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3609566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
361da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
362aa372e3fSPaul Mullowney         /* Create the matrix description */
3639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3649566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3651b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3669566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
367afb2bd1cSJunchao Zhang   #else
3689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
369afb2bd1cSJunchao Zhang   #endif
3709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
372aa372e3fSPaul Mullowney 
373aa372e3fSPaul Mullowney         /* set the operation */
374aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
375aa372e3fSPaul Mullowney 
376aa372e3fSPaul Mullowney         /* set the matrix */
377aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
378aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
379aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
380aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
381aa372e3fSPaul Mullowney 
382aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
383aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
384aa372e3fSPaul Mullowney 
385aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
386aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
387aa372e3fSPaul Mullowney 
388aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
389aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
390aa372e3fSPaul Mullowney 
391afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
3929566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
393261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
3941b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3959371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
3969371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
3979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
398afb2bd1cSJunchao Zhang   #endif
399afb2bd1cSJunchao Zhang 
400aa372e3fSPaul Mullowney         /* perform the solve analysis */
4019371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4029f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4039566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4049566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
405aa372e3fSPaul Mullowney 
406da79fbbcSStefano Zampini         /* assign the pointer */
407aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4082cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4099566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4109566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4119566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
412da79fbbcSStefano Zampini       } else { /* update values only */
41348a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
414da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4152cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
416da79fbbcSStefano Zampini         v                    = aa;
417da79fbbcSStefano Zampini         vi                   = aj;
418da79fbbcSStefano Zampini         offset               = 1;
419da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
420da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
421f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
422da79fbbcSStefano Zampini           offset += nz;
4232cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
424da79fbbcSStefano Zampini           offset += 1;
425da79fbbcSStefano Zampini           v += nz;
426da79fbbcSStefano Zampini         }
4272cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4289566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
429da79fbbcSStefano Zampini       }
430d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
431d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
432d71ae5a4SJacob Faibussowitsch     }
4339ae82921SPaul Mullowney   }
4343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4359ae82921SPaul Mullowney }
4369ae82921SPaul Mullowney 
437d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
438d71ae5a4SJacob Faibussowitsch {
4399ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
4409ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
4419ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
442aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
443*421480d9SBarry Smith   const PetscInt                    *aj                 = a->j, *adiag, *vi;
4449ae82921SPaul Mullowney   const MatScalar                   *aa                 = a->a, *v;
4459ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4469ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
4479ae82921SPaul Mullowney 
4489ae82921SPaul Mullowney   PetscFunctionBegin;
4493ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
450*421480d9SBarry Smith   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
451c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4529ae82921SPaul Mullowney     try {
4539ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4549ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
455da79fbbcSStefano Zampini       if (!upTriFactor) {
4562cbc15d9SMark         PetscScalar *AAUp;
4572cbc15d9SMark 
4589566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
4592cbc15d9SMark 
4609ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4619566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4629566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4639ae82921SPaul Mullowney 
4649ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4659ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4669ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4679ae82921SPaul Mullowney         offset  = nzUpper;
4689ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4699ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4709ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4719ae82921SPaul Mullowney 
472e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4739ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4749ae82921SPaul Mullowney 
475e057df02SPaul Mullowney           /* decrement the offset */
4769ae82921SPaul Mullowney           offset -= (nz + 1);
4779ae82921SPaul Mullowney 
478e057df02SPaul Mullowney           /* first, set the diagonal elements */
4799ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
48009f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
4819ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
4829ae82921SPaul Mullowney 
483f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
484f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
4859ae82921SPaul Mullowney         }
4862205254eSKarl Rupp 
487aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4889566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
489da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
4902205254eSKarl Rupp 
491aa372e3fSPaul Mullowney         /* Create the matrix description */
4929566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
4939566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4941b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4959566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
496afb2bd1cSJunchao Zhang   #else
4979566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
498afb2bd1cSJunchao Zhang   #endif
4999566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
5009566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
501aa372e3fSPaul Mullowney 
502aa372e3fSPaul Mullowney         /* set the operation */
503aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
504aa372e3fSPaul Mullowney 
505aa372e3fSPaul Mullowney         /* set the matrix */
506aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
507aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
508aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
509aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
510aa372e3fSPaul Mullowney 
511aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
512aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
513aa372e3fSPaul Mullowney 
514aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
515aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
516aa372e3fSPaul Mullowney 
517aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
518aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
519aa372e3fSPaul Mullowney 
520afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
5219566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
522261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
5231b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
5249371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5259371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
5269566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
527afb2bd1cSJunchao Zhang   #endif
528afb2bd1cSJunchao Zhang 
529aa372e3fSPaul Mullowney         /* perform the solve analysis */
5309371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5319f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
5329f7ba44dSJacob Faibussowitsch 
5339566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
5349566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
535aa372e3fSPaul Mullowney 
536da79fbbcSStefano Zampini         /* assign the pointer */
537aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
5382cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
5399566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
5409566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
5419566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
542da79fbbcSStefano Zampini       } else {
54348a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
544da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
545da79fbbcSStefano Zampini         offset = nzUpper;
546da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
547da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
548da79fbbcSStefano Zampini 
549da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
550da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
551da79fbbcSStefano Zampini 
552da79fbbcSStefano Zampini           /* decrement the offset */
553da79fbbcSStefano Zampini           offset -= (nz + 1);
554da79fbbcSStefano Zampini 
555da79fbbcSStefano Zampini           /* first, set the diagonal elements */
5562cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
557f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
558da79fbbcSStefano Zampini         }
5592cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5609566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
561da79fbbcSStefano Zampini       }
562d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
563d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
564d71ae5a4SJacob Faibussowitsch     }
5659ae82921SPaul Mullowney   }
5663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5679ae82921SPaul Mullowney }
568d460d7bfSJunchao Zhang #endif
5699ae82921SPaul Mullowney 
570d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
571d71ae5a4SJacob Faibussowitsch {
5729ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
5739ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
574c9e33d71SJunchao Zhang   IS                            isrow = a->row, isicol = a->icol;
5759ae82921SPaul Mullowney   PetscBool                     row_identity, col_identity;
5769ae82921SPaul Mullowney   PetscInt                      n = A->rmap->n;
5779ae82921SPaul Mullowney 
5789ae82921SPaul Mullowney   PetscFunctionBegin;
57928b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
580b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
581d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
582d460d7bfSJunchao Zhang #else
5839566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
5849566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
585ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
586d460d7bfSJunchao Zhang #endif
587d460d7bfSJunchao Zhang 
588aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = a->nz;
5899ae82921SPaul Mullowney 
590d460d7bfSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
591e057df02SPaul Mullowney   /* lower triangular indices */
5929566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
593da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
594da79fbbcSStefano Zampini     const PetscInt *r;
595da79fbbcSStefano Zampini 
5969566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow, &r));
597aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
598aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r + n);
5999566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow, &r));
6009566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
601da79fbbcSStefano Zampini   }
6029ae82921SPaul Mullowney 
603e057df02SPaul Mullowney   /* upper triangular indices */
604c9e33d71SJunchao Zhang   PetscCall(ISIdentity(isicol, &col_identity));
605da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
606da79fbbcSStefano Zampini     const PetscInt *c;
607da79fbbcSStefano Zampini 
608c9e33d71SJunchao Zhang     PetscCall(ISGetIndices(isicol, &c));
609aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
610aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c + n);
611c9e33d71SJunchao Zhang     PetscCall(ISRestoreIndices(isicol, &c));
6129566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
613da79fbbcSStefano Zampini   }
6143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6159ae82921SPaul Mullowney }
6169ae82921SPaul Mullowney 
617b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
618d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
619d460d7bfSJunchao Zhang {
620d460d7bfSJunchao Zhang   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
621d460d7bfSJunchao Zhang   PetscInt                      m  = A->rmap->n;
622d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
623*421480d9SBarry Smith   const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
624d460d7bfSJunchao Zhang   const MatScalar              *Aa = a->a;
625d460d7bfSJunchao Zhang   PetscInt                     *Mj, Mnz;
626d460d7bfSJunchao Zhang   PetscScalar                  *Ma, *D;
627d460d7bfSJunchao Zhang 
628d460d7bfSJunchao Zhang   PetscFunctionBegin;
629*421480d9SBarry Smith   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
630d460d7bfSJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
631d460d7bfSJunchao Zhang     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
632d460d7bfSJunchao Zhang       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
633d460d7bfSJunchao Zhang       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
634d460d7bfSJunchao Zhang       Mnz = Ai[m]; // Unz (with the unit diagonal)
635d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Ma));
636d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
637d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(m, &D));    // the diagonal
638d460d7bfSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
639d460d7bfSJunchao Zhang         PetscInt ulen = Ai[i + 1] - Ai[i];
640d460d7bfSJunchao Zhang         Mj[Ai[i]]     = i;                                              // diagonal entry
641d460d7bfSJunchao Zhang         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
642d460d7bfSJunchao Zhang       }
643d460d7bfSJunchao Zhang       // Copy M (U) from host to device
644f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
645f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
646f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
647f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
648d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
649d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
650d460d7bfSJunchao Zhang 
651d460d7bfSJunchao Zhang       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
652d460d7bfSJunchao Zhang       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
653d460d7bfSJunchao Zhang       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
654d460d7bfSJunchao Zhang       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
655d460d7bfSJunchao Zhang       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
656d460d7bfSJunchao Zhang       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
657d460d7bfSJunchao Zhang       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
658d460d7bfSJunchao Zhang       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
659d460d7bfSJunchao Zhang 
660d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
661d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
662d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
663d460d7bfSJunchao Zhang 
664d460d7bfSJunchao Zhang       // Allocate work vectors in SpSv
665f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
666f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
667d460d7bfSJunchao Zhang 
668d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
669d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
670d460d7bfSJunchao Zhang 
671d460d7bfSJunchao Zhang       // Query buffer sizes for SpSV and then allocate buffers
672d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
673d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
674d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
675d460d7bfSJunchao Zhang 
676aaa8cc7dSPierre Jolivet       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
677d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
678d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
679d460d7bfSJunchao Zhang 
680d460d7bfSJunchao Zhang       // Record for reuse
681d460d7bfSJunchao Zhang       fs->csrVal_h = Ma;
682d460d7bfSJunchao Zhang       fs->diag_h   = D;
683d460d7bfSJunchao Zhang       PetscCall(PetscFree(Mj));
684d460d7bfSJunchao Zhang     }
685d460d7bfSJunchao Zhang     // Copy the value
686d460d7bfSJunchao Zhang     Ma  = fs->csrVal_h;
687d460d7bfSJunchao Zhang     D   = fs->diag_h;
688d460d7bfSJunchao Zhang     Mnz = Ai[m];
689d460d7bfSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
690*421480d9SBarry Smith       D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
691d460d7bfSJunchao Zhang       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
692d460d7bfSJunchao Zhang       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
693d460d7bfSJunchao Zhang     }
694d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
695d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
696d460d7bfSJunchao Zhang 
697204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
698204a0e31SJunchao Zhang     if (fs->updatedSpSVAnalysis) {
699204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
700204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
701204a0e31SJunchao Zhang     } else
702204a0e31SJunchao Zhang   #endif
703204a0e31SJunchao Zhang     {
704d460d7bfSJunchao Zhang       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
705d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
706d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
707204a0e31SJunchao Zhang       fs->updatedSpSVAnalysis = PETSC_TRUE;
708204a0e31SJunchao Zhang     }
709d460d7bfSJunchao Zhang   }
710d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
711d460d7bfSJunchao Zhang }
712d460d7bfSJunchao Zhang 
713d460d7bfSJunchao Zhang // Solve Ut D U x = b
714d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
715d460d7bfSJunchao Zhang {
716d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
717d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
718d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
719d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
720d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
721d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
722d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
723d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
724d460d7bfSJunchao Zhang 
725d460d7bfSJunchao Zhang   PetscFunctionBegin;
726d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
727d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
728d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
729d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
730d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
731d460d7bfSJunchao Zhang 
732d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
733d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
734d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
735d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
736d460d7bfSJunchao Zhang   } else {
737d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
738d460d7bfSJunchao Zhang   }
739d460d7bfSJunchao Zhang 
740d460d7bfSJunchao Zhang   // Solve Ut Y = X
741d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
742d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
743d460d7bfSJunchao Zhang 
744d460d7bfSJunchao Zhang   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
745d460d7bfSJunchao Zhang   // It is basically a vector element-wise multiplication, but cublas does not have it!
746d460d7bfSJunchao Zhang   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
747d460d7bfSJunchao Zhang 
748d460d7bfSJunchao Zhang   // Solve U X = Y
749d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
750d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
751d460d7bfSJunchao Zhang   } else {
752d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
753d460d7bfSJunchao Zhang   }
754d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
755d460d7bfSJunchao Zhang 
756d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
757d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
758d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
759d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
760d460d7bfSJunchao Zhang   }
761d460d7bfSJunchao Zhang 
762d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
763d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
764d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
765d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
766d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
767d460d7bfSJunchao Zhang }
768d460d7bfSJunchao Zhang #else
769d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
770d71ae5a4SJacob Faibussowitsch {
771087f3262SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
772087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
773aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
774aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
775087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
776087f3262SPaul Mullowney   PetscScalar                       *AAUp;
777087f3262SPaul Mullowney   PetscScalar                       *AALo;
778087f3262SPaul Mullowney   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
779087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
780087f3262SPaul Mullowney   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
781087f3262SPaul Mullowney   const MatScalar                   *aa = b->a, *v;
782087f3262SPaul Mullowney 
783087f3262SPaul Mullowney   PetscFunctionBegin;
7843ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
785c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
786087f3262SPaul Mullowney     try {
7879566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
7889566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
789da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
790087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7919566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
7929566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
793087f3262SPaul Mullowney 
794087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
795087f3262SPaul Mullowney         AiUp[0] = (PetscInt)0;
796087f3262SPaul Mullowney         AiUp[n] = nzUpper;
797087f3262SPaul Mullowney         offset  = 0;
798087f3262SPaul Mullowney         for (i = 0; i < n; i++) {
799087f3262SPaul Mullowney           /* set the pointers */
800087f3262SPaul Mullowney           v  = aa + ai[i];
801087f3262SPaul Mullowney           vj = aj + ai[i];
802087f3262SPaul Mullowney           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
803087f3262SPaul Mullowney 
804087f3262SPaul Mullowney           /* first, set the diagonal elements */
805087f3262SPaul Mullowney           AjUp[offset] = (PetscInt)i;
80609f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0 / v[nz];
807087f3262SPaul Mullowney           AiUp[i]      = offset;
80809f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0 / v[nz];
809087f3262SPaul Mullowney 
810087f3262SPaul Mullowney           offset += 1;
811087f3262SPaul Mullowney           if (nz > 0) {
812f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
813f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
814087f3262SPaul Mullowney             for (j = offset; j < offset + nz; j++) {
815087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
816087f3262SPaul Mullowney               AALo[j] = AAUp[j] / v[nz];
817087f3262SPaul Mullowney             }
818087f3262SPaul Mullowney             offset += nz;
819087f3262SPaul Mullowney           }
820087f3262SPaul Mullowney         }
821087f3262SPaul Mullowney 
822aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8239566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
824da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
825087f3262SPaul Mullowney 
826aa372e3fSPaul Mullowney         /* Create the matrix description */
8279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
8289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8291b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
831afb2bd1cSJunchao Zhang   #else
8329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
833afb2bd1cSJunchao Zhang   #endif
8349566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8359566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
836087f3262SPaul Mullowney 
837aa372e3fSPaul Mullowney         /* set the matrix */
838aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
839aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = A->rmap->n;
840aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = A->cmap->n;
841aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
842aa372e3fSPaul Mullowney 
843aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
844aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
845aa372e3fSPaul Mullowney 
846aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
847aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
848aa372e3fSPaul Mullowney 
849aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
850aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
851aa372e3fSPaul Mullowney 
852afb2bd1cSJunchao Zhang         /* set the operation */
853afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
854afb2bd1cSJunchao Zhang 
855afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8569566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
857261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
8581b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8599371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8609371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
8619566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
862afb2bd1cSJunchao Zhang   #endif
863afb2bd1cSJunchao Zhang 
864aa372e3fSPaul Mullowney         /* perform the solve analysis */
8659371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8669f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
8679f7ba44dSJacob Faibussowitsch 
8689566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8699566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
870aa372e3fSPaul Mullowney 
871da79fbbcSStefano Zampini         /* assign the pointer */
872aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
873aa372e3fSPaul Mullowney 
874aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8759566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
876da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
877aa372e3fSPaul Mullowney 
878aa372e3fSPaul Mullowney         /* Create the matrix description */
8799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8809566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8811b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8829566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
883afb2bd1cSJunchao Zhang   #else
8849566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
885afb2bd1cSJunchao Zhang   #endif
8869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
888aa372e3fSPaul Mullowney 
889aa372e3fSPaul Mullowney         /* set the operation */
890aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
891aa372e3fSPaul Mullowney 
892aa372e3fSPaul Mullowney         /* set the matrix */
893aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
894aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = A->rmap->n;
895aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = A->cmap->n;
896aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
897aa372e3fSPaul Mullowney 
898aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
899aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
900aa372e3fSPaul Mullowney 
901aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
902aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
903aa372e3fSPaul Mullowney 
904aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
905aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
906aa372e3fSPaul Mullowney 
907afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
9089566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
909261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
9101b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9119371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9129371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
9139566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
914afb2bd1cSJunchao Zhang   #endif
915afb2bd1cSJunchao Zhang 
916aa372e3fSPaul Mullowney         /* perform the solve analysis */
9179371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9189f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
9199f7ba44dSJacob Faibussowitsch 
9209566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
9219566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
922aa372e3fSPaul Mullowney 
923da79fbbcSStefano Zampini         /* assign the pointer */
924aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
925087f3262SPaul Mullowney 
9269566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
9279566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
9289566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
929da79fbbcSStefano Zampini       } else {
930da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
931da79fbbcSStefano Zampini         offset = 0;
932da79fbbcSStefano Zampini         for (i = 0; i < n; i++) {
933da79fbbcSStefano Zampini           /* set the pointers */
934da79fbbcSStefano Zampini           v  = aa + ai[i];
935da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
936da79fbbcSStefano Zampini 
937da79fbbcSStefano Zampini           /* first, set the diagonal elements */
938da79fbbcSStefano Zampini           AAUp[offset] = 1.0 / v[nz];
939da79fbbcSStefano Zampini           AALo[offset] = 1.0 / v[nz];
940da79fbbcSStefano Zampini 
941da79fbbcSStefano Zampini           offset += 1;
942da79fbbcSStefano Zampini           if (nz > 0) {
943f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
944da79fbbcSStefano Zampini             for (j = offset; j < offset + nz; j++) {
945da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
946da79fbbcSStefano Zampini               AALo[j] = AAUp[j] / v[nz];
947da79fbbcSStefano Zampini             }
948da79fbbcSStefano Zampini             offset += nz;
949da79fbbcSStefano Zampini           }
950da79fbbcSStefano Zampini         }
95128b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
95228b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
953da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
954da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
9559566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
956da79fbbcSStefano Zampini       }
9579566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
9589566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
959d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
960d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
961d71ae5a4SJacob Faibussowitsch     }
962087f3262SPaul Mullowney   }
9633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
964087f3262SPaul Mullowney }
965d460d7bfSJunchao Zhang #endif
966087f3262SPaul Mullowney 
967d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
968d71ae5a4SJacob Faibussowitsch {
969087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
970087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
971087f3262SPaul Mullowney   IS                            ip                 = a->row;
972087f3262SPaul Mullowney   PetscBool                     perm_identity;
973087f3262SPaul Mullowney   PetscInt                      n = A->rmap->n;
974087f3262SPaul Mullowney 
975087f3262SPaul Mullowney   PetscFunctionBegin;
97628b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
977d460d7bfSJunchao Zhang 
978b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
979d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
980d460d7bfSJunchao Zhang #else
9819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
982ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
983d460d7bfSJunchao Zhang #endif
984aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
985aa372e3fSPaul Mullowney 
986da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
987da79fbbcSStefano Zampini 
988087f3262SPaul Mullowney   /* lower triangular indices */
9899566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
990087f3262SPaul Mullowney   if (!perm_identity) {
9914e4bbfaaSStefano Zampini     IS              iip;
992da79fbbcSStefano Zampini     const PetscInt *irip, *rip;
9934e4bbfaaSStefano Zampini 
9949566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
9959566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip, &irip));
9969566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip, &rip));
997aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
998aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
999aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
10004e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
10019566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip, &irip));
10029566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
10039566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip, &rip));
10049566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1005da79fbbcSStefano Zampini   }
10063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1007087f3262SPaul Mullowney }
1008087f3262SPaul Mullowney 
1009d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1010d71ae5a4SJacob Faibussowitsch {
1011087f3262SPaul Mullowney   PetscFunctionBegin;
10129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
10139566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1014ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
1015d460d7bfSJunchao Zhang 
1016b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1017d460d7bfSJunchao Zhang   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1018d460d7bfSJunchao Zhang   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1019d460d7bfSJunchao Zhang #else
1020087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
1021d460d7bfSJunchao Zhang   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1022d460d7bfSJunchao Zhang   IS          ip = b->row;
1023d460d7bfSJunchao Zhang   PetscBool   perm_identity;
1024d460d7bfSJunchao Zhang 
10259566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
1026087f3262SPaul Mullowney   if (perm_identity) {
1027087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1028087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1029087f3262SPaul Mullowney   } else {
1030087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1031087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1032d460d7bfSJunchao Zhang   }
1033d460d7bfSJunchao Zhang #endif
10344e4bbfaaSStefano Zampini   B->ops->matsolve          = NULL;
10354e4bbfaaSStefano Zampini   B->ops->matsolvetranspose = NULL;
1036087f3262SPaul Mullowney 
1037087f3262SPaul Mullowney   /* get the triangular factors */
10389566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
10393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1040087f3262SPaul Mullowney }
10419ae82921SPaul Mullowney 
1042b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1043d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1044d71ae5a4SJacob Faibussowitsch {
1045bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1046aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1047aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1048da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1049da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1050aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1051aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1052aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1053aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1054b175d8bbSPaul Mullowney 
1055bda325fcSPaul Mullowney   PetscFunctionBegin;
1056aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10579566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1058da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1059aa372e3fSPaul Mullowney 
1060aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1061aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1062aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10639371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1064aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1065aa372e3fSPaul Mullowney 
1066aa372e3fSPaul Mullowney   /* Create the matrix description */
10679566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10689566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10699566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10709566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10719566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1072aa372e3fSPaul Mullowney 
1073aa372e3fSPaul Mullowney   /* set the operation */
1074aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1075aa372e3fSPaul Mullowney 
1076aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1077aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1078afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1079afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1080aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1081afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1082afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1083afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1084aa372e3fSPaul Mullowney 
1085aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1086afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10879371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
10889371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10899371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10909566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1091afb2bd1cSJunchao Zhang   #endif
1092afb2bd1cSJunchao Zhang 
10939566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10949f7ba44dSJacob Faibussowitsch   {
10959f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
10969f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
10979371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1098afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10999f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1100afb2bd1cSJunchao Zhang   #else
11019f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1102afb2bd1cSJunchao Zhang   #endif
11039f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11049f7ba44dSJacob Faibussowitsch   }
11059f7ba44dSJacob Faibussowitsch 
11069566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11079566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1108aa372e3fSPaul Mullowney 
1109afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11109566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1111261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11121b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11139371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11149371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1116afb2bd1cSJunchao Zhang   #endif
1117afb2bd1cSJunchao Zhang 
1118afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11199371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11209f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11219f7ba44dSJacob Faibussowitsch 
11229566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11239566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1124aa372e3fSPaul Mullowney 
1125da79fbbcSStefano Zampini   /* assign the pointer */
1126aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1127aa372e3fSPaul Mullowney 
1128aa372e3fSPaul Mullowney   /*********************************************/
1129aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1130aa372e3fSPaul Mullowney   /*********************************************/
1131aa372e3fSPaul Mullowney 
1132aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11339566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1134da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1135aa372e3fSPaul Mullowney 
1136aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1137aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1138aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11399371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1140aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1141aa372e3fSPaul Mullowney 
1142aa372e3fSPaul Mullowney   /* Create the matrix description */
11439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11469566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1148aa372e3fSPaul Mullowney 
1149aa372e3fSPaul Mullowney   /* set the operation */
1150aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1151aa372e3fSPaul Mullowney 
1152aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1153aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1154afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1155afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1156aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1157afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1158afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1159afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1160aa372e3fSPaul Mullowney 
1161aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1162afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11639371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11649371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11659371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11669566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1167afb2bd1cSJunchao Zhang   #endif
1168afb2bd1cSJunchao Zhang 
11699566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11709f7ba44dSJacob Faibussowitsch   {
11719f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11729f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11739371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1174afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11759f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1176afb2bd1cSJunchao Zhang   #else
11779f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1178afb2bd1cSJunchao Zhang   #endif
11799f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11809f7ba44dSJacob Faibussowitsch   }
1181d49cd2b7SBarry Smith 
11829566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11839566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1184aa372e3fSPaul Mullowney 
1185afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11869566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1187261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
11881b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11899371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11909371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
11919566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1192afb2bd1cSJunchao Zhang   #endif
1193afb2bd1cSJunchao Zhang 
1194afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11955f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
11969371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11979f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1198d49cd2b7SBarry Smith 
11999566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12009566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1201aa372e3fSPaul Mullowney 
1202da79fbbcSStefano Zampini   /* assign the pointer */
1203aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1205bda325fcSPaul Mullowney }
1206d460d7bfSJunchao Zhang #endif
1207bda325fcSPaul Mullowney 
/* Unary functor: truncate the real part of a PetscScalar to a PetscInt.
   Used with thrust::transform to turn the csr2csc-permuted sequence values
   back into the integer permutation array csr2csc_i. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return static_cast<PetscInt>(PetscRealPart(s)); }
};
1211a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - build (first call) or refresh (later calls) the
  explicit GPU transpose of a SeqAIJCUSPARSE matrix, stored in cusparsestruct->matTranspose.

  For the CSR format the refresh is cheap: a one-time csr2csc run on the sequence 0..nnz-1
  yields a permutation (csr2csc_i) that is cached, so later value-only updates reduce to a
  single permuted device copy instead of a full csr2csc.  Returns immediately when
  A->transupdated is already set.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats do not reuse a cached transpose structure: throw it away and rebuild */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose swaps the row/column dimensions; the entry count is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* cache a 32-bit device copy of the host row offsets a->i; csr2csc below reads it */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* HYB/ELL has no direct transpose routine: go HYB -> CSR -> CSC -> HYB via temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* build the csr2csc permutation once: run csr2csc on the sequence 0..nnz-1 so the permuted
         output, truncated back to integers by PetscScalarToPetscInt, records for each transpose
         entry its source index in A's values */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* value refresh: gather A's current values through the cached permutation into the transpose */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1404bda325fcSPaul Mullowney 
1405b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Solve A x = b with the cached cusparse LU factorization (CUDA >= 11.4 SpSV path):
// forward solve L Y = Pb, backward solve U X = Y, then scatter X through the column
// permutation into x.  Row/column permutations are optional (fs->rpermIndices /
// fs->cpermIndices may be NULL); fs->X and fs->Y are preallocated device work vectors.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fact    = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *amat    = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     noTrans = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       algo    = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *braw;
  PetscScalar                  *xraw;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xraw));
  PetscCall(VecCUDAGetArrayRead(b, &braw));
  thrust::device_ptr<PetscScalar>       xdev = thrust::device_pointer_cast(xraw);
  thrust::device_ptr<const PetscScalar> bdev = thrust::device_pointer_cast(braw);

  // Bind the right-hand side to descriptor X: either b permuted by the row
  // permutation into the work vector fs->X, or b itself when no permutation exists
  if (fact->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, fact->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, fact->rpermIndices->end()), thrust::device_pointer_cast(fact->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, (void *)braw));
  }

  // Forward solve L Y = X.  Note that cusparseSpSV_solve() secretly reuses the
  // external buffer supplied to cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_Y, fact->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, noTrans, &PETSC_CUSPARSE_ONE, fact->spMatDescr_L, fact->dnVecDescr_X, fact->dnVecDescr_Y, cusparse_scalartype, algo, fact->spsvDescr_L));

  // Backward solve U X = Y; write into the scratch fs->X when a column permutation
  // must still be applied, otherwise directly into x's device array
  PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->cpermIndices ? fact->X : xraw));
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, noTrans, &PETSC_CUSPARSE_ONE, fact->spMatDescr_U, fact->dnVecDescr_Y, fact->dnVecDescr_X, cusparse_scalartype, algo, fact->spsvDescr_U));

  // Scatter the intermediate result through the column permutation back into x
  if (fact->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X), fact->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X + nrows), fact->cpermIndices->end()), xdev));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &braw));
  PetscCall(VecCUDARestoreArrayWrite(x, &xraw));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * amat->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1457d460d7bfSJunchao Zhang 
// Solve A^T x = b with the cached cusparse LU factorization (CUDA >= 11.4 SpSV path).
// Reuses the L and U matrix descriptors from the forward solve, but with
// CUSPARSE_OPERATION_TRANSPOSE: solve U^T Y = (permuted) b, then L^T X = Y, then apply
// the column permutation (if any).  The transpose-solve SpSV descriptors, buffers and
// analyses are created lazily on the first call and cached on fs.
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    // Create Lt/Ut SpSV descriptors and size/allocate their external work buffers
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // Run the SpSV analysis phase for the transpose solves
  // (flag is presumably cleared elsewhere when the factor values change — set only here)
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  // same flop count as the forward LU solve: two triangular solves
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1528d460d7bfSJunchao Zhang #else
1529a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU for a factorization computed with non-trivial
   row/column orderings, using the legacy cusparseXcsrsv_solve() API
   (pre-CUDA-11.4 code path).

   The transposed factor structures are built lazily on the first transpose
   solve.  Pipeline (work = factor work vector):
     x    <- gather(b, rpermIndices)      row-permute the right-hand side
     work <- solve(upTriFactorT, x)       transposed-upper-factor solve
     x    <- solve(loTriFactorT, work)    transposed-lower-factor solve
     work <- gather(x, cpermIndices)      column permutation (not in-place safe)
     x    <- work
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x <- gather(b, rpermIndices) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U: work <- solve(upTriFactorT, x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: x <- solve(loTriFactorT, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1581bda325fcSPaul Mullowney 
/* Solve A^T x = b on the GPU when the factorization was done in natural
   ordering, so no row/column permutations are applied.  Uses the legacy
   cusparseXcsrsv_solve() API (pre-CUDA-11.4 code path); the transposed factor
   structures are created lazily on the first transpose solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Build the transposed factor structures on the fly if this is the first transpose solve */
  if (!upT && !loT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Map the vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Transposed-upper-factor solve: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Transposed-lower-factor solve: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x, loT->solvePolicy, loT->solveBuffer));

  /* Hand the vectors back and log the work done */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1619bda325fcSPaul Mullowney 
/* Solve A x = b on the GPU for a factorization computed with non-trivial
   row/column orderings, using the legacy cusparseXcsrsv_solve() API
   (pre-CUDA-11.4 code path).

   Pipeline (work = factor work vector):
     work <- gather(b, rpermIndices)      row-permute the right-hand side
     x    <- solve(loTriFactor, work)     lower-factor solve
     work <- solve(upTriFactor, x)        upper-factor solve
     x    <- gather(work, cpermIndices)   column permutation
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: work <- gather(b, rpermIndices) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: x <- solve(loTriFactor, work) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: work <- solve(upTriFactor, x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x <- gather(work, cpermIndices) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
16599ae82921SPaul Mullowney 
/* Solve A x = b on the GPU when the factorization was done in natural
   ordering, so no row/column permutations are applied.  Uses the legacy
   cusparseXcsrsv_solve() API (pre-CUDA-11.4 code path): a lower-factor solve
   into the factor work vector followed by an upper-factor solve into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;

  PetscFunctionBegin;
  /* Map the vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Lower-factor solve: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Upper-factor solve: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  /* Hand the vectors back and log the work done */
  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1689d460d7bfSJunchao Zhang #endif
16909ae82921SPaul Mullowney 
1691b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization on the GPU via cusparseXcsrilu02().

   The sparsity pattern, matrix/vector descriptors, SpSV descriptors and all
   device buffers were allocated in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0();
   this routine copies A's current values into fact's device value array,
   factors in place, and then refreshes the SpSV data the triangular solves
   need.  It also installs the SpSV-based solve callbacks on fact. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  /* Debug-only sanity check on the input matrix type */
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  /* Debug-only zero-pivot check; cusparseXcsrilu02_zeroPivot() blocks on the device */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* With CUDA >= 12.1.1, reuse a previous SpSV analysis and only push the new factor values into the L and U solve descriptors */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  /* The factor now lives (only) on the GPU; install the SpSV-based solve callbacks */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1754da112707SJunchao Zhang 
17558eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1756d71ae5a4SJacob Faibussowitsch {
1757da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1758da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1759da112707SJunchao Zhang   PetscInt                      m, nz;
1760da112707SJunchao Zhang 
1761da112707SJunchao Zhang   PetscFunctionBegin;
1762da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1763*421480d9SBarry Smith     PetscBool flg, diagDense;
1764da112707SJunchao Zhang 
1765da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1766da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1767da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1768*421480d9SBarry Smith     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1769*421480d9SBarry Smith     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
1770da112707SJunchao Zhang   }
1771da112707SJunchao Zhang 
1772da112707SJunchao Zhang   /* Free the old stale stuff */
1773da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1774da112707SJunchao Zhang 
1775da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1776da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1777da112707SJunchao Zhang    */
1778da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1779da112707SJunchao Zhang 
1780da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1781da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1782da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1783da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1784da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1785da112707SJunchao Zhang 
1786da112707SJunchao Zhang   aij->row = NULL;
1787da112707SJunchao Zhang   aij->col = NULL;
1788da112707SJunchao Zhang 
1789da112707SJunchao Zhang   /* ====================================================================== */
1790da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1791da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1792da112707SJunchao Zhang   /* ====================================================================== */
1793da112707SJunchao Zhang   const int *Ai, *Aj;
1794da112707SJunchao Zhang 
1795da112707SJunchao Zhang   m  = fact->rmap->n;
1796da112707SJunchao Zhang   nz = aij->nz;
1797da112707SJunchao Zhang 
1798f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1799f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1800f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1801d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1802d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1803d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1804da112707SJunchao Zhang 
1805da112707SJunchao Zhang   /* ====================================================================== */
1806da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1807da112707SJunchao Zhang   /* ====================================================================== */
1808da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1809da112707SJunchao Zhang   cusparseDiagType_t diagType;
1810da112707SJunchao Zhang 
1811da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1812da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1813da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1814da112707SJunchao Zhang 
1815da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1816da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1817da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1818da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1819da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1820da112707SJunchao Zhang   */
1821da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1822da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1823d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18249371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18259371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1826da112707SJunchao Zhang 
1827da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1828da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1829d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18309371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18319371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1832da112707SJunchao Zhang 
1833da112707SJunchao Zhang   /* ========================================================================= */
1834da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1835da112707SJunchao Zhang   /* ========================================================================= */
1836da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18379371c9d4SSatish Balay   if (m)
18389371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1839d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1840da112707SJunchao Zhang 
1841da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1842da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1843da112707SJunchao Zhang 
1844da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1845da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1846da112707SJunchao Zhang 
1847da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18489371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1849da112707SJunchao Zhang 
1850da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18519371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1852da112707SJunchao Zhang 
1853da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
185412ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
185512ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
185612ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1857da112707SJunchao Zhang    */
185812ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
185912ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
186012ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1861da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
186212ba2bc6SJunchao Zhang   } else {
186312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
186412ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1865da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
186612ba2bc6SJunchao Zhang   }
1867da112707SJunchao Zhang 
1868da112707SJunchao Zhang   /* ========================================================================== */
1869da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1870da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1871da112707SJunchao Zhang   /* ========================================================================== */
1872da112707SJunchao Zhang   int              structural_zero;
1873da112707SJunchao Zhang   cusparseStatus_t status;
1874da112707SJunchao Zhang 
1875da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18769371c9d4SSatish Balay   if (m)
18779371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1878d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1879da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
188046aba097SBarry Smith     /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1881da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1882da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1883da112707SJunchao Zhang   }
1884da112707SJunchao Zhang 
1885da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18860dd8c0acSJunchao Zhang   {
1887da112707SJunchao Zhang     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
1888*421480d9SBarry Smith     PetscInt       *Ai, nzRow, nzLeft;
1889*421480d9SBarry Smith     const PetscInt *adiag;
1890da112707SJunchao Zhang     PetscLogDouble  flops = 0.0;
1891da112707SJunchao Zhang 
1892*421480d9SBarry Smith     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
1893da112707SJunchao Zhang     Ai = Aseq->i;
1894da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1895*421480d9SBarry Smith       if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1896da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1897*421480d9SBarry Smith         nzLeft = adiag[i] - Ai[i];
1898da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1899da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1900da112707SJunchao Zhang         */
1901da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1902da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1903da112707SJunchao Zhang       }
1904da112707SJunchao Zhang     }
1905da112707SJunchao Zhang     fs->numericFactFlops = flops;
19060dd8c0acSJunchao Zhang   }
1907da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1909da112707SJunchao Zhang }
1910da112707SJunchao Zhang 
1911d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1912d71ae5a4SJacob Faibussowitsch {
1913da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1914da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1915da112707SJunchao Zhang   const PetscScalar            *barray;
1916da112707SJunchao Zhang   PetscScalar                  *xarray;
1917da112707SJunchao Zhang 
1918da112707SJunchao Zhang   PetscFunctionBegin;
1919da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1920da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1921da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1922da112707SJunchao Zhang 
1923da112707SJunchao Zhang   /* Solve L*y = b */
1924da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1925da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
19269371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
19279371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1928da112707SJunchao Zhang 
1929da112707SJunchao Zhang   /* Solve Lt*x = y */
1930da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
19319371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
19329371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1933da112707SJunchao Zhang 
1934da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1935da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1936da112707SJunchao Zhang 
1937da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1938da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
19393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1940da112707SJunchao Zhang }
1941da112707SJunchao Zhang 
19428eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1943d71ae5a4SJacob Faibussowitsch {
1944da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1945da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1946da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1947da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1948da112707SJunchao Zhang   PetscInt                      m, nz;
1949da112707SJunchao Zhang   PetscBool                     flg;
1950da112707SJunchao Zhang 
1951da112707SJunchao Zhang   PetscFunctionBegin;
1952da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1953da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1954da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1955da112707SJunchao Zhang   }
1956da112707SJunchao Zhang 
1957da112707SJunchao Zhang   /* Copy A's value to fact */
1958da112707SJunchao Zhang   m  = fact->rmap->n;
1959da112707SJunchao Zhang   nz = aij->nz;
1960da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1961da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1962da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1963da112707SJunchao Zhang 
1964da112707SJunchao Zhang   /* Factorize fact inplace */
1965da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
196646aba097SBarry Smith      csric02() only takes the lower triangular part of matrix A to perform factorization.
1967da112707SJunchao Zhang      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1968da112707SJunchao Zhang      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1969da112707SJunchao Zhang      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1970da112707SJunchao Zhang    */
1971d460d7bfSJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1972da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1973da112707SJunchao Zhang     int              numerical_zero;
1974da112707SJunchao Zhang     cusparseStatus_t status;
1975da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1976da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1977da112707SJunchao Zhang   }
1978da112707SJunchao Zhang 
1979204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1980204a0e31SJunchao Zhang   if (fs->updatedSpSVAnalysis) {
1981204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1982204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1983204a0e31SJunchao Zhang   } else
1984204a0e31SJunchao Zhang   #endif
1985204a0e31SJunchao Zhang   {
19869371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1987da112707SJunchao Zhang 
1988da112707SJunchao Zhang     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1989da112707SJunchao Zhang     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1990da112707SJunchao Zhang   */
19919371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1992204a0e31SJunchao Zhang     fs->updatedSpSVAnalysis = PETSC_TRUE;
1993204a0e31SJunchao Zhang   }
1994da112707SJunchao Zhang 
1995da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1996da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1997da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1998da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1999da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
2000da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
20013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2002da112707SJunchao Zhang }
2003da112707SJunchao Zhang 
20048eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2005d71ae5a4SJacob Faibussowitsch {
2006da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2007da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2008da112707SJunchao Zhang   PetscInt                      m, nz;
2009da112707SJunchao Zhang 
2010da112707SJunchao Zhang   PetscFunctionBegin;
2011da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
2012*421480d9SBarry Smith     PetscBool flg, diagDense;
2013da112707SJunchao Zhang 
2014da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2015da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2016da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2017*421480d9SBarry Smith     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
2018*421480d9SBarry Smith     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
2019da112707SJunchao Zhang   }
2020da112707SJunchao Zhang 
2021da112707SJunchao Zhang   /* Free the old stale stuff */
2022da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2023da112707SJunchao Zhang 
2024da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2025da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
2026da112707SJunchao Zhang    */
2027da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2028da112707SJunchao Zhang 
2029da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2030da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
2031da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
2032da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
2033da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
2034da112707SJunchao Zhang 
2035da112707SJunchao Zhang   aij->row = NULL;
2036da112707SJunchao Zhang   aij->col = NULL;
2037da112707SJunchao Zhang 
2038da112707SJunchao Zhang   /* ====================================================================== */
2039da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2040da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
2041da112707SJunchao Zhang   /* ====================================================================== */
2042da112707SJunchao Zhang   const int *Ai, *Aj;
2043da112707SJunchao Zhang 
2044da112707SJunchao Zhang   m  = fact->rmap->n;
2045da112707SJunchao Zhang   nz = aij->nz;
2046da112707SJunchao Zhang 
2047f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2048f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2049da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2050da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2051d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2052d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2053da112707SJunchao Zhang 
2054da112707SJunchao Zhang   /* ====================================================================== */
2055da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
2056da112707SJunchao Zhang   /* ====================================================================== */
2057da112707SJunchao Zhang   cusparseFillMode_t fillMode;
2058da112707SJunchao Zhang   cusparseDiagType_t diagType;
2059da112707SJunchao Zhang 
2060da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2061da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2062da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2063da112707SJunchao Zhang 
2064da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2065da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2066da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2067da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2068da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2069da112707SJunchao Zhang   */
2070da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
2071da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2072d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
20739371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
20749371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2075da112707SJunchao Zhang 
2076da112707SJunchao Zhang   /* ========================================================================= */
2077da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2078da112707SJunchao Zhang   /* ========================================================================= */
2079da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2080d460d7bfSJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2081da112707SJunchao Zhang 
2082da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2083da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2084da112707SJunchao Zhang 
2085da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2086da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2087da112707SJunchao Zhang 
2088da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
20899371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2090da112707SJunchao Zhang 
2091da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
20929371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2093da112707SJunchao Zhang 
209412ba2bc6SJunchao Zhang   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
209512ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
209612ba2bc6SJunchao Zhang    */
209712ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
209812ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
209912ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
2100da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
210112ba2bc6SJunchao Zhang   } else {
210212ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
210312ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
210412ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
210512ba2bc6SJunchao Zhang   }
2106da112707SJunchao Zhang 
2107da112707SJunchao Zhang   /* ========================================================================== */
2108da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
2109da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
2110da112707SJunchao Zhang   /* ========================================================================== */
2111da112707SJunchao Zhang   int              structural_zero;
2112da112707SJunchao Zhang   cusparseStatus_t status;
2113da112707SJunchao Zhang 
2114da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2115d460d7bfSJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2116da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
211746aba097SBarry Smith     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2118da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2119da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2120da112707SJunchao Zhang   }
2121da112707SJunchao Zhang 
2122da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
21230dd8c0acSJunchao Zhang   {
2124da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
21250dd8c0acSJunchao Zhang     PetscInt      *Ai, nzRow, nzLeft;
2126da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
2127da112707SJunchao Zhang 
2128da112707SJunchao Zhang     Ai = Aseq->i;
2129da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
2130da112707SJunchao Zhang       nzRow = Ai[i + 1] - Ai[i];
2131da112707SJunchao Zhang       if (nzRow > 1) {
2132da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2133da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2134da112707SJunchao Zhang         */
2135da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
2136da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2137da112707SJunchao Zhang       }
2138da112707SJunchao Zhang     }
2139da112707SJunchao Zhang     fs->numericFactFlops = flops;
21400dd8c0acSJunchao Zhang   }
2141da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
21423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2143da112707SJunchao Zhang }
2144da112707SJunchao Zhang #endif
2145da112707SJunchao Zhang 
2146d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2147d460d7bfSJunchao Zhang {
2148b820271fSJunchao Zhang   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2149b820271fSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2150d460d7bfSJunchao Zhang 
2151d460d7bfSJunchao Zhang   PetscFunctionBegin;
2152d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2153d460d7bfSJunchao Zhang   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2154d460d7bfSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
2155d460d7bfSJunchao Zhang 
2156d460d7bfSJunchao Zhang   if (!cusparsestruct->use_cpu_solve) {
2157b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2158d460d7bfSJunchao Zhang     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2159d460d7bfSJunchao Zhang     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2160d460d7bfSJunchao Zhang #else
2161d460d7bfSJunchao Zhang     /* determine which version of MatSolve needs to be used. */
2162d460d7bfSJunchao Zhang     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2163d460d7bfSJunchao Zhang     IS          isrow = b->row, iscol = b->col;
2164d460d7bfSJunchao Zhang     PetscBool   row_identity, col_identity;
2165d460d7bfSJunchao Zhang 
2166d460d7bfSJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
2167d460d7bfSJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
2168d460d7bfSJunchao Zhang     if (row_identity && col_identity) {
2169d460d7bfSJunchao Zhang       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2170d460d7bfSJunchao Zhang       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2171d460d7bfSJunchao Zhang     } else {
2172d460d7bfSJunchao Zhang       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2173d460d7bfSJunchao Zhang       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2174d460d7bfSJunchao Zhang     }
2175d460d7bfSJunchao Zhang #endif
2176d460d7bfSJunchao Zhang   }
2177d460d7bfSJunchao Zhang   B->ops->matsolve          = NULL;
2178d460d7bfSJunchao Zhang   B->ops->matsolvetranspose = NULL;
2179d460d7bfSJunchao Zhang 
2180d460d7bfSJunchao Zhang   /* get the triangular factors */
2181d460d7bfSJunchao Zhang   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2182d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2183d460d7bfSJunchao Zhang }
2184d460d7bfSJunchao Zhang 
2185d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2186d460d7bfSJunchao Zhang {
2187d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2188d460d7bfSJunchao Zhang 
2189d460d7bfSJunchao Zhang   PetscFunctionBegin;
2190d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2191d460d7bfSJunchao Zhang   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2192d460d7bfSJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2193d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2194d460d7bfSJunchao Zhang }
2195d460d7bfSJunchao Zhang 
2196d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2197d71ae5a4SJacob Faibussowitsch {
2198da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2199da112707SJunchao Zhang 
2200da112707SJunchao Zhang   PetscFunctionBegin;
2201b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2202bc996fdcSJunchao Zhang   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2203f82ac72cSJunchao Zhang   if (!info->factoronhost) {
2204da112707SJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
2205da112707SJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
2206bc996fdcSJunchao Zhang   }
2207da112707SJunchao Zhang   if (!info->levels && row_identity && col_identity) {
2208da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2209da112707SJunchao Zhang   } else
2210da112707SJunchao Zhang #endif
2211da112707SJunchao Zhang   {
2212da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2213da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2214da112707SJunchao Zhang     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2215da112707SJunchao Zhang   }
22163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2217da112707SJunchao Zhang }
2218da112707SJunchao Zhang 
2219d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2220d71ae5a4SJacob Faibussowitsch {
2221da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2222da112707SJunchao Zhang 
2223da112707SJunchao Zhang   PetscFunctionBegin;
2224b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2225bc996fdcSJunchao Zhang   PetscBool perm_identity = PETSC_FALSE;
2226f82ac72cSJunchao Zhang   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2227da112707SJunchao Zhang   if (!info->levels && perm_identity) {
2228da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2229da112707SJunchao Zhang   } else
2230da112707SJunchao Zhang #endif
2231da112707SJunchao Zhang   {
2232da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2233da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2234da112707SJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2235da112707SJunchao Zhang   }
22363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2237da112707SJunchao Zhang }
2238da112707SJunchao Zhang 
2239d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2240d71ae5a4SJacob Faibussowitsch {
2241da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2242da112707SJunchao Zhang 
2243da112707SJunchao Zhang   PetscFunctionBegin;
2244da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2245da112707SJunchao Zhang   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2246da112707SJunchao Zhang   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
22473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2248da112707SJunchao Zhang }
2249da112707SJunchao Zhang 
225066976f2fSJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2251d71ae5a4SJacob Faibussowitsch {
2252841d4cb1SJunchao Zhang   PetscFunctionBegin;
2253841d4cb1SJunchao Zhang   *type = MATSOLVERCUSPARSE;
22543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2255841d4cb1SJunchao Zhang }
2256841d4cb1SJunchao Zhang 
2257841d4cb1SJunchao Zhang /*MC
2258841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
225911a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
2260841d4cb1SJunchao Zhang   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
2261841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
226211a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2263841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2264841d4cb1SJunchao Zhang 
2265841d4cb1SJunchao Zhang   Level: beginner
2266841d4cb1SJunchao Zhang 
22671cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22682ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2269841d4cb1SJunchao Zhang M*/
2270841d4cb1SJunchao Zhang 
/*
  MatGetFactor_seqaijcusparse_cusparse - factory routine for the MATSOLVERCUSPARSE package.

  Creates the (square, n x n) factor matrix B for A and installs the symbolic-factorization
  function pointers appropriate for the requested factor type.  When A is bound to the CPU,
  the plain SeqAIJ symbolic routines are installed instead of the CUSPARSE ones.

  Input Parameters:
+ A     - the matrix to be factored (only its row map size is used here)
- ftype - one of MAT_FACTOR_LU/ILU/ILUDT/CHOLESKY/ICC; anything else raises PETSC_ERR_SUP

  Output Parameter:
. B - the new factor matrix (allocation of values is skipped; the factorization fills it)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  /* NOTE: ordering matters -- factortype must be set before MatSetType() below */
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* propagate CPU binding from A so the factor lives where A lives */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* skip value allocation; the numeric factorization provides the storage */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2311841d4cb1SJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyFromGPU - sync the matrix values from the device back to the host CSR.

  Only acts when the up-to-date copy lives exclusively on the GPU (PETSC_OFFLOAD_GPU);
  afterwards both copies are valid (PETSC_OFFLOAD_BOTH).  Only the values array a->a is
  transferred -- the sparsity pattern (i/j) is assumed unchanged on the host.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* for factored matrices spptr instead holds the triangular-factors struct */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: values live in the CSR held by the mult struct */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23417e8381f9SStefano Zampini 
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* read/write access: make the host values current before handing them out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
234967a45760SJunchao Zhang 
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* the caller may have modified the host values, so the device copy is now stale */
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
235767a45760SJunchao Zhang 
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* read-only access still requires the host values to be up to date */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236567a45760SJunchao Zhang 
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  /* read-only access: nothing was modified, just invalidate the handed-out pointer */
  array[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237267a45760SJunchao Zhang 
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* write-only access: no device-to-host sync, the contents will be overwritten */
  array[0] = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237967a45760SJunchao Zhang 
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* host values were (re)written, so only the CPU copy is now valid */
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
23877e8381f9SStefano Zampini 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - expose the raw device CSR arrays of the matrix.

  Syncs the host values to the GPU, then returns pointers to the device-resident row
  offsets (i), column indices (j) and values (a); each output may be passed as NULL if
  not wanted.  mtype, if requested, reports PETSC_MEMTYPE_CUDA.

  Notes:
  - Not usable on factored matrices (the CSR is held differently there).
  - The device CSR stores 32-bit indices (THRUSTINTARRAY32), so i/j cannot be returned
    as PetscInt pointers in a 64-bit-indices build; an error is raised instead.
    (Also fixes the error-message grammar: "does not supported" -> "does not support".)
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24187ee59b9bSJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyToGPU - mirror the host CSR of A onto the GPU as cuSPARSE structures.

  Two paths:
  - If the nonzero pattern is unchanged (same nonzerostate) and CSR storage is used,
    only the values array is re-uploaded and the cached transpose is invalidated
    (values only, structure kept).
  - Otherwise the whole device structure (mult struct, work vector, row offsets,
    transpose) is destroyed and rebuilt from scratch, honoring compressed-row storage
    and the selected storage format (CSR always; ELL/HYB only pre CUDA 11).

  Errors if A is bound to the CPU.  On success the offload mask becomes
  PETSC_OFFLOAD_BOTH unless the host values array was absent (structure-only copy).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set host+device valid afterwards (unless no host values) */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so the cached transpose's values are stale (structure kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* pattern changed: tear everything down and rebuild */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* choose the row set: compressed-row (nonzero rows only) or the full row map */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: copy structure only, and do not claim PETSC_OFFLOAD_BOTH */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (cuSPARSE is used in device pointer mode below) */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre CUDA-11 only: build a temporary CSR, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25679ae82921SPaul Mullowney 
/* Thrust functor over a (source, destination) zip tuple: destination += source */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2575aa372e3fSPaul Mullowney 
/* Thrust functor over a (source, destination) zip tuple: destination = source */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    auto &dst = thrust::get<1>(t);
    dst       = thrust::get<0>(t);
  }
};
25837e8381f9SStefano Zampini 
/* Thrust functor over a zip tuple with the roles swapped: element 0 = element 1 */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    auto &dst = thrust::get<0>(t);
    dst       = thrust::get<1>(t);
  }
};
2591e6e9a74fSStefano Zampini 
/* Per-product context for sparse(-dense) matrix-matrix products backed by cuSPARSE.
   Owned by Mat_Product::data and freed by MatProductCtxDestroy_MatMatCusparse(). */
struct MatProductCtx_MatMatCusparse {
  PetscBool      cisdense; /* is the product matrix C dense? */
  PetscScalar   *Bt;       /* device buffer (cudaMalloc'ed) holding an explicit transpose of B, if needed */
  Mat            X;        /* intermediate dense matrix for PtAP/RARt (freed with the context) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* extra SpGEMM work buffers required by the CUDA >= 11.4 API */
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2616ccdfe979SStefano Zampini 
/* Destructor for MatProductCtx_MatMatCusparse: releases all device buffers,
   cuSPARSE descriptors, the intermediate matrix X, and the context itself. */
static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(void **data)
{
  MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors/buffers are only destroyed when they were actually created */
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(*data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2640ccdfe979SStefano Zampini 
26414742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2642ccdfe979SStefano Zampini 
2643d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2644d71ae5a4SJacob Faibussowitsch {
2645ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2646ccdfe979SStefano Zampini   Mat                           A, B;
2647afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2648ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2649ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2650ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2651ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2652ccdfe979SStefano Zampini   const PetscScalar            *barray;
2653ccdfe979SStefano Zampini   PetscScalar                  *carray;
2654cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2655ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2656ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2657ccdfe979SStefano Zampini 
2658ccdfe979SStefano Zampini   PetscFunctionBegin;
2659ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
266028b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2661cc1eb50dSBarry Smith   mmdata = (MatProductCtx_MatMatCusparse *)product->data;
2662ccdfe979SStefano Zampini   A      = product->A;
2663ccdfe979SStefano Zampini   B      = product->B;
26649566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
266528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2666ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2667ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
266828b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26699566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2670ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2671ccdfe979SStefano Zampini   switch (product->type) {
2672ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2673ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2674ccdfe979SStefano Zampini     mat = cusp->mat;
2675ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2676ccdfe979SStefano Zampini     m   = A->rmap->n;
2677ccdfe979SStefano Zampini     n   = B->cmap->n;
2678ccdfe979SStefano Zampini     break;
2679ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
26801a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2681e6e9a74fSStefano Zampini       mat = cusp->mat;
2682e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2683e6e9a74fSStefano Zampini     } else {
26849566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2685ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2686ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2687e6e9a74fSStefano Zampini     }
2688ccdfe979SStefano Zampini     m = A->cmap->n;
2689ccdfe979SStefano Zampini     n = B->cmap->n;
2690ccdfe979SStefano Zampini     break;
2691ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2692ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2693ccdfe979SStefano Zampini     mat = cusp->mat;
2694ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2695ccdfe979SStefano Zampini     m   = A->rmap->n;
2696ccdfe979SStefano Zampini     n   = B->rmap->n;
2697ccdfe979SStefano Zampini     break;
2698d71ae5a4SJacob Faibussowitsch   default:
2699d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2700ccdfe979SStefano Zampini   }
270128b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2702ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2703ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27049566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27059566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2706cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2707afb2bd1cSJunchao Zhang 
27089566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2709c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2710cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27119566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2712c8378d12SStefano Zampini   } else {
2713cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27149566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2715c8378d12SStefano Zampini   }
2716c8378d12SStefano Zampini 
27179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2718afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2719afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2720fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2721fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2722fe5544b9SJunchao Zhang   #else
2723fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2724fe5544b9SJunchao Zhang   #endif
2725fe5544b9SJunchao Zhang 
2726a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2727afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2728fcdce8c4SStefano Zampini     size_t mmBufferSize;
27299371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27309371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27319371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27329371c9d4SSatish Balay     }
2733afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27349566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2735afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2736afb2bd1cSJunchao Zhang     }
2737c8378d12SStefano Zampini 
27389371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27399371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27409371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27419371c9d4SSatish Balay     }
2742afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27439566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2744afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2745afb2bd1cSJunchao Zhang     }
2746afb2bd1cSJunchao Zhang 
2747fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2748fe5544b9SJunchao Zhang     if (matADescr) {
274917f5f06fSJunchao Zhang       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2750fe5544b9SJunchao Zhang       matADescr = NULL;
2751fe5544b9SJunchao Zhang     }
2752fe5544b9SJunchao Zhang   #endif
2753fe5544b9SJunchao Zhang 
2754fe5544b9SJunchao Zhang     if (!matADescr) {
2755fe5544b9SJunchao Zhang       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27569371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27579371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2758afb2bd1cSJunchao Zhang     }
2759fe5544b9SJunchao Zhang 
2760fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2761fe5544b9SJunchao Zhang 
2762fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27639566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27649566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2765fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2766fcdce8c4SStefano Zampini     }
2767fe5544b9SJunchao Zhang 
2768f0b74427SPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2769fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2770fe5544b9SJunchao Zhang   #endif
2771fe5544b9SJunchao Zhang 
2772afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2773afb2bd1cSJunchao Zhang   } else {
2774afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2775fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
27769566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
27779566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2778afb2bd1cSJunchao Zhang   }
2779afb2bd1cSJunchao Zhang 
2780afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2781fe5544b9SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2782afb2bd1cSJunchao Zhang #else
2783afb2bd1cSJunchao Zhang   PetscInt k;
2784afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2785ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2786ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2787ccdfe979SStefano Zampini     cublasStatus_t cerr;
2788ccdfe979SStefano Zampini 
27899566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
27909371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
27919371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2792ccdfe979SStefano Zampini     blda = B->cmap->n;
2793afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2794afb2bd1cSJunchao Zhang   } else {
2795afb2bd1cSJunchao Zhang     k = B->rmap->n;
2796ccdfe979SStefano Zampini   }
2797ccdfe979SStefano Zampini 
2798afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
27999371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
28009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2801afb2bd1cSJunchao Zhang #endif
28029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
28039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2804cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2805ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2806cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28074742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2808ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2809cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28104742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2811ccdfe979SStefano Zampini   } else {
2812cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2813ccdfe979SStefano Zampini   }
281448a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
281548a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
28163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2817ccdfe979SStefano Zampini }
2818ccdfe979SStefano Zampini 
2819d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2820d71ae5a4SJacob Faibussowitsch {
2821ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2822ccdfe979SStefano Zampini   Mat                           A, B;
2823ccdfe979SStefano Zampini   PetscInt                      m, n;
2824ccdfe979SStefano Zampini   PetscBool                     cisdense, flg;
2825cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2826ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2827ccdfe979SStefano Zampini 
2828ccdfe979SStefano Zampini   PetscFunctionBegin;
2829ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
283028b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2831ccdfe979SStefano Zampini   A = product->A;
2832ccdfe979SStefano Zampini   B = product->B;
28339566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
283428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2835ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
283608401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2837ccdfe979SStefano Zampini   switch (product->type) {
2838ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2839ccdfe979SStefano Zampini     m = A->rmap->n;
2840ccdfe979SStefano Zampini     n = B->cmap->n;
28410e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2842ccdfe979SStefano Zampini     break;
2843ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2844ccdfe979SStefano Zampini     m = A->cmap->n;
2845ccdfe979SStefano Zampini     n = B->cmap->n;
28460e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
28470e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2848ccdfe979SStefano Zampini     break;
2849ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2850ccdfe979SStefano Zampini     m = A->rmap->n;
2851ccdfe979SStefano Zampini     n = B->rmap->n;
28520e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
28530e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2854ccdfe979SStefano Zampini     break;
2855ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2856ccdfe979SStefano Zampini     m = B->cmap->n;
2857ccdfe979SStefano Zampini     n = B->cmap->n;
28580e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
28590e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2860ccdfe979SStefano Zampini     break;
2861ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2862ccdfe979SStefano Zampini     m = B->rmap->n;
2863ccdfe979SStefano Zampini     n = B->rmap->n;
28640e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
28650e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2866ccdfe979SStefano Zampini     break;
2867d71ae5a4SJacob Faibussowitsch   default:
2868d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2869ccdfe979SStefano Zampini   }
28709566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2871ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
28729566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
28739566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2874ccdfe979SStefano Zampini 
2875ccdfe979SStefano Zampini   /* product data */
28769566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2877ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2878afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2879afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
288048a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2881afb2bd1cSJunchao Zhang #endif
2882ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2883ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
28849566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
28859566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2886ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
28879566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2888ccdfe979SStefano Zampini     } else {
28899566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2890ccdfe979SStefano Zampini     }
2891ccdfe979SStefano Zampini   }
2892ccdfe979SStefano Zampini   C->product->data    = mmdata;
2893cc1eb50dSBarry Smith   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2894ccdfe979SStefano Zampini 
2895ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
28963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2897ccdfe979SStefano Zampini }
2898ccdfe979SStefano Zampini 
2899d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2900d71ae5a4SJacob Faibussowitsch {
2901ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2902fcdce8c4SStefano Zampini   Mat                           A, B;
2903fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2904fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2905fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2906fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2907fcdce8c4SStefano Zampini   PetscBool                     flg;
2908fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2909fcdce8c4SStefano Zampini   MatProductType                ptype;
2910cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2911fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2912fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2913fcdce8c4SStefano Zampini #endif
2914b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2915ccdfe979SStefano Zampini 
2916ccdfe979SStefano Zampini   PetscFunctionBegin;
2917ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
291828b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
29199566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
292028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2921cc1eb50dSBarry Smith   mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
2922fcdce8c4SStefano Zampini   A      = product->A;
2923fcdce8c4SStefano Zampini   B      = product->B;
2924fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2925fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2926fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
292708401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2928fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
292928b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2930fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
293128b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2932fcdce8c4SStefano Zampini     goto finalize;
2933fcdce8c4SStefano Zampini   }
2934fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29359566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
293628b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29379566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
293828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
293928b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
294028b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2941fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2942fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2943fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
294408401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
294508401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
294608401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
29479566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29489566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2949fcdce8c4SStefano Zampini 
2950fcdce8c4SStefano Zampini   ptype = product->type;
2951b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2952fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
295328b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2954fa046f9fSJunchao Zhang   }
2955b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2956fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
295728b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2958fa046f9fSJunchao Zhang   }
2959fcdce8c4SStefano Zampini   switch (ptype) {
2960fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2961fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2962fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2963fcdce8c4SStefano Zampini     break;
2964fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2965fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2966fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2967fcdce8c4SStefano Zampini     break;
2968fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2969fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2970fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2971fcdce8c4SStefano Zampini     break;
2972d71ae5a4SJacob Faibussowitsch   default:
2973d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2974fcdce8c4SStefano Zampini   }
2975fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
297628b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
297728b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
297828b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2979fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2980fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2981fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
298228b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
298328b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
298428b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
29859566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2986fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2987fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
29889566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2989b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
29909371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29919371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2992b4285af6SJunchao Zhang   #else
29939371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29949371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29959371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29969371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2997b4285af6SJunchao Zhang   #endif
2998fcdce8c4SStefano Zampini #else
29999371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30009371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30019371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3002fcdce8c4SStefano Zampini #endif
30039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30049566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
30059566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3006fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
3007fcdce8c4SStefano Zampini finalize:
3008fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
30099566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
30109566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
30119566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3012fcdce8c4SStefano Zampini   c->reallocs = 0;
3013fcdce8c4SStefano Zampini   C->info.mallocs += 0;
3014fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
3015fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
3016fcdce8c4SStefano Zampini   C->num_ass++;
30173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3018ccdfe979SStefano Zampini }
3019fcdce8c4SStefano Zampini 
3020d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3021d71ae5a4SJacob Faibussowitsch {
3022fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3023fcdce8c4SStefano Zampini   Mat                           A, B;
3024fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3025fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3026fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3027fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3028fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3029fcdce8c4SStefano Zampini   PetscBool                     flg;
3030fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3031fcdce8c4SStefano Zampini   MatProductType                ptype;
3032cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
3033fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3034fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3035fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3036fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3037fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3038fcdce8c4SStefano Zampini #else
3039fcdce8c4SStefano Zampini   int cnz;
3040fcdce8c4SStefano Zampini #endif
3041b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3042fcdce8c4SStefano Zampini 
3043fcdce8c4SStefano Zampini   PetscFunctionBegin;
3044fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
304528b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3046fcdce8c4SStefano Zampini   A = product->A;
3047fcdce8c4SStefano Zampini   B = product->B;
30489566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
304928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30509566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
305128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3052fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3053fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3054fcdce8c4SStefano Zampini   /* product data */
30559566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3056fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3057cc1eb50dSBarry Smith   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3058fcdce8c4SStefano Zampini 
30599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30609566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3061d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3062d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
306308401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
306408401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3065d60bce21SJunchao Zhang 
3066fcdce8c4SStefano Zampini   ptype = product->type;
3067b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3068fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3069fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3070fa046f9fSJunchao Zhang   }
3071b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3072fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3073fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3074fa046f9fSJunchao Zhang   }
3075fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3076fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3077fcdce8c4SStefano Zampini   switch (ptype) {
3078fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3079fcdce8c4SStefano Zampini     m    = A->rmap->n;
3080fcdce8c4SStefano Zampini     n    = B->cmap->n;
3081fcdce8c4SStefano Zampini     k    = A->cmap->n;
3082fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3083fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3084fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3085fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3086fcdce8c4SStefano Zampini     break;
3087fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3088fcdce8c4SStefano Zampini     m = A->cmap->n;
3089fcdce8c4SStefano Zampini     n = B->cmap->n;
3090fcdce8c4SStefano Zampini     k = A->rmap->n;
30919566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3092fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3093fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3094fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3095fcdce8c4SStefano Zampini     break;
3096fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3097fcdce8c4SStefano Zampini     m = A->rmap->n;
3098fcdce8c4SStefano Zampini     n = B->rmap->n;
3099fcdce8c4SStefano Zampini     k = A->cmap->n;
31009566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3101fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3102fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3103fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3104fcdce8c4SStefano Zampini     break;
3105d71ae5a4SJacob Faibussowitsch   default:
3106d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3107fcdce8c4SStefano Zampini   }
3108fcdce8c4SStefano Zampini 
3109fcdce8c4SStefano Zampini   /* create cusparse matrix */
31109566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
31119566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3112fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3113fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3114fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3115fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3116fcdce8c4SStefano Zampini 
3117fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3118fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3119fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
31209566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
31219566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3122fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3123fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3124fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3125fcdce8c4SStefano Zampini   } else {
3126fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3127fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3128fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3129fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3130fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3131fcdce8c4SStefano Zampini   }
3132fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3133fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3134fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3135fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3136fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3137fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31389566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31399566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31409566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3141f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3142f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3143f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31449566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31459566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31469566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3147fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3148d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3149fcdce8c4SStefano Zampini     c->nz                = 0;
3150fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3151fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3152fcdce8c4SStefano Zampini     goto finalizesym;
3153fcdce8c4SStefano Zampini   }
3154fcdce8c4SStefano Zampini 
315528b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
315628b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3157fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3158fcdce8c4SStefano Zampini   if (!biscompressed) {
3159fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3160fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3161fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3162fcdce8c4SStefano Zampini #endif
3163fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3164fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3165fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3166fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3167fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3168fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3169fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3170fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3171fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3172fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3173fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31749566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3175fcdce8c4SStefano Zampini     }
3176fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3177fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3178fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3179fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
31809371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31819371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3182fcdce8c4SStefano Zampini     }
3183fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3184fcdce8c4SStefano Zampini #endif
3185fcdce8c4SStefano Zampini   }
318628b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
318728b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3188fcdce8c4SStefano Zampini   /* precompute flops count */
3189fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3190fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3191fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3192fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3193fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3194fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3195fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3196fcdce8c4SStefano Zampini       }
3197fcdce8c4SStefano Zampini     }
3198fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3199fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3200fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3201fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3202fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3203fcdce8c4SStefano Zampini     }
3204fcdce8c4SStefano Zampini   } else { /* TODO */
3205fcdce8c4SStefano Zampini     flops = 0.;
3206fcdce8c4SStefano Zampini   }
3207fcdce8c4SStefano Zampini 
3208fcdce8c4SStefano Zampini   mmdata->flops = flops;
32099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3210b4285af6SJunchao Zhang 
3211fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
32129566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
32131ffab3bdSJunchao Zhang   // cuda-12.2 requires non-null csrRowOffsets
32141ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32159371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32169566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3217b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3218b4285af6SJunchao Zhang   {
3219b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3220b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3221b4285af6SJunchao Zhang   */
3222b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3223b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3224b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3225b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3226b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3227b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3228b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3229b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3230b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3231b4285af6SJunchao Zhang 
3232b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32339371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32349371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32359566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3236b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32379371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32389371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3239b4285af6SJunchao Zhang 
32409371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32419371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32459371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32469371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3249b4285af6SJunchao Zhang 
3250b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32519566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3252b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3253b4285af6SJunchao Zhang     /* allocate matrix C */
32549371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32559371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32569371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32579371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3258b4285af6SJunchao Zhang     /* update matC with the new pointers */
32599371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32609371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3261b4285af6SJunchao Zhang 
32629371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32639371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32649566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32659371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32669371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32679566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32689371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32699371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32709566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3271b4285af6SJunchao Zhang   }
3272ae37ee31SJunchao Zhang   #else
3273b4285af6SJunchao Zhang   size_t bufSize2;
3274fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
32759371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
32769371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32779566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3278fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
32799371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
32809371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3281fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
32829371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
32839371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3284fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3285fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3286fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3287fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3288fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
32899566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3290fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
32919371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
32929371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3293fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
32949566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3295fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
32969371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
32979371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3298fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32999566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3300fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33019566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
33029371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
33039371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33049371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
33059371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3306ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3307fcdce8c4SStefano Zampini #else
33089566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
33099371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33109371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
33119371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3312fcdce8c4SStefano Zampini   c->nz                = cnz;
3313fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3315fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3317fcdce8c4SStefano Zampini 
33189566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3319fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3320fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3321fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
33229371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33239371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
33249371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3325fcdce8c4SStefano Zampini #endif
33269566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3328fcdce8c4SStefano Zampini finalizesym:
3329fcdce8c4SStefano Zampini   c->free_a = PETSC_TRUE;
33309f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
33319f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3332fcdce8c4SStefano Zampini   c->free_ij = PETSC_TRUE;
33337de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3334fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3335fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3336fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3337fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3338fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3339fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33419566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3342fcdce8c4SStefano Zampini   } else {
3343fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3344fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3347fcdce8c4SStefano Zampini   }
3348fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3349fcdce8c4SStefano Zampini     PetscInt r = 0;
3350fcdce8c4SStefano Zampini     c->i[0]    = 0;
3351fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3352fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3353fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3354fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3355fcdce8c4SStefano Zampini     }
3356fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3357fcdce8c4SStefano Zampini   }
33589566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3361fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3362fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3363fcdce8c4SStefano Zampini   c->rmax          = 0;
3364fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3365fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3366fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3367fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3368fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3369fcdce8c4SStefano Zampini   }
33709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3371fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3372fcdce8c4SStefano Zampini 
3373fcdce8c4SStefano Zampini   C->nonzerostate++;
33749566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
33759566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3376fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3377fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3378fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3379fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3380fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3381abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3382fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3383fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3384fcdce8c4SStefano Zampini   }
3385fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
33863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3387fcdce8c4SStefano Zampini }
3388fcdce8c4SStefano Zampini 
3389fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3390fcdce8c4SStefano Zampini 
3391fcdce8c4SStefano Zampini /* handles sparse or dense B */
3392d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3393d71ae5a4SJacob Faibussowitsch {
3394fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3395fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3396fcdce8c4SStefano Zampini 
3397fcdce8c4SStefano Zampini   PetscFunctionBegin;
3398fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
33999566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
340048a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3401fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3402fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
340348a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3404fcdce8c4SStefano Zampini   }
340565e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
340665e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
340765e4b4d4SStefano Zampini     switch (product->type) {
340865e4b4d4SStefano Zampini     case MATPRODUCT_AB:
340965e4b4d4SStefano Zampini       if (product->api_user) {
3410d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
34119566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3412d0609cedSBarry Smith         PetscOptionsEnd();
341365e4b4d4SStefano Zampini       } else {
3414d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
34159566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3416d0609cedSBarry Smith         PetscOptionsEnd();
341765e4b4d4SStefano Zampini       }
341865e4b4d4SStefano Zampini       break;
341965e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
342065e4b4d4SStefano Zampini       if (product->api_user) {
3421d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
34229566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3423d0609cedSBarry Smith         PetscOptionsEnd();
342465e4b4d4SStefano Zampini       } else {
3425d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
34269566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3427d0609cedSBarry Smith         PetscOptionsEnd();
342865e4b4d4SStefano Zampini       }
342965e4b4d4SStefano Zampini       break;
343065e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
343165e4b4d4SStefano Zampini       if (product->api_user) {
3432d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
34339566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3434d0609cedSBarry Smith         PetscOptionsEnd();
343565e4b4d4SStefano Zampini       } else {
3436d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
34379566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3438d0609cedSBarry Smith         PetscOptionsEnd();
343965e4b4d4SStefano Zampini       }
344065e4b4d4SStefano Zampini       break;
344165e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
344265e4b4d4SStefano Zampini       if (product->api_user) {
3443d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
34449566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3445d0609cedSBarry Smith         PetscOptionsEnd();
344665e4b4d4SStefano Zampini       } else {
3447d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
34489566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3449d0609cedSBarry Smith         PetscOptionsEnd();
345065e4b4d4SStefano Zampini       }
345165e4b4d4SStefano Zampini       break;
345265e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
345365e4b4d4SStefano Zampini       if (product->api_user) {
3454d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
34559566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3456d0609cedSBarry Smith         PetscOptionsEnd();
345765e4b4d4SStefano Zampini       } else {
3458d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
34599566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3460d0609cedSBarry Smith         PetscOptionsEnd();
346165e4b4d4SStefano Zampini       }
346265e4b4d4SStefano Zampini       break;
3463d71ae5a4SJacob Faibussowitsch     default:
3464d71ae5a4SJacob Faibussowitsch       break;
346565e4b4d4SStefano Zampini     }
346665e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
346765e4b4d4SStefano Zampini   }
346865e4b4d4SStefano Zampini   /* dispatch */
3469fcdce8c4SStefano Zampini   if (isdense) {
3470ccdfe979SStefano Zampini     switch (product->type) {
3471ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3472ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3473ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3474ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3475ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3476fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
34779566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3478fcdce8c4SStefano Zampini       } else {
3479fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3480fcdce8c4SStefano Zampini       }
3481fcdce8c4SStefano Zampini       break;
3482d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3483d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3484d71ae5a4SJacob Faibussowitsch       break;
3485d71ae5a4SJacob Faibussowitsch     default:
3486d71ae5a4SJacob Faibussowitsch       break;
3487ccdfe979SStefano Zampini     }
3488fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3489fcdce8c4SStefano Zampini     switch (product->type) {
3490fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3491fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3492d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3493d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3494d71ae5a4SJacob Faibussowitsch       break;
3495fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3496fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3497d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3498d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3499d71ae5a4SJacob Faibussowitsch       break;
3500d71ae5a4SJacob Faibussowitsch     default:
3501d71ae5a4SJacob Faibussowitsch       break;
3502fcdce8c4SStefano Zampini     }
3503fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
35049566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3505fcdce8c4SStefano Zampini   }
35063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3507ccdfe979SStefano Zampini }
3508ccdfe979SStefano Zampini 
/* yy = A*xx. Thin wrapper: delegates to the shared kernel with no add-vector (NULL), trans=FALSE, herm=FALSE. */
3509d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3510d71ae5a4SJacob Faibussowitsch {
35119ae82921SPaul Mullowney   PetscFunctionBegin;
35129566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
35133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3514e6e9a74fSStefano Zampini }
3515e6e9a74fSStefano Zampini 
/* zz = A*xx + yy. Thin wrapper over the shared kernel with trans=FALSE, herm=FALSE. */
3516d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3517d71ae5a4SJacob Faibussowitsch {
3518e6e9a74fSStefano Zampini   PetscFunctionBegin;
35199566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
35203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3521e6e9a74fSStefano Zampini }
3522e6e9a74fSStefano Zampini 
/* yy = A^H*xx (conjugate transpose). Thin wrapper over the shared kernel with trans=TRUE, herm=TRUE. */
3523d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3524d71ae5a4SJacob Faibussowitsch {
3525e6e9a74fSStefano Zampini   PetscFunctionBegin;
35269566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
35273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3528e6e9a74fSStefano Zampini }
3529e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy. Thin wrapper over the shared kernel with trans=TRUE, herm=TRUE. */
3530d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3531d71ae5a4SJacob Faibussowitsch {
3532e6e9a74fSStefano Zampini   PetscFunctionBegin;
35339566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
35343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
35359ae82921SPaul Mullowney }
35369ae82921SPaul Mullowney 
/* yy = A^T*xx (plain transpose, no conjugation). Thin wrapper over the shared kernel with trans=TRUE, herm=FALSE. */
3537d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3538d71ae5a4SJacob Faibussowitsch {
3539ca45077fSPaul Mullowney   PetscFunctionBegin;
35409566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
35413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3542ca45077fSPaul Mullowney }
3543ca45077fSPaul Mullowney 
/* CUDA kernel: y[idx[i]] += x[i] for i in [0, n). One thread per entry, launched with 256-thread
   blocks by the caller. No atomics are used, so this presumes the idx[] entries are distinct
   (they come from compressed-row index lists at the call sites) -- NOTE(review): verify no caller
   passes duplicate indices.
   NOTE(review): the flat index 'i' is a 32-bit int while n is PetscInt; for n > INT_MAX the index
   computation could overflow -- confirm callers bound n appropriately. */
3544d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3545d71ae5a4SJacob Faibussowitsch {
3546a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3547a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3548a0e72f99SJunchao Zhang }
3549a0e72f99SJunchao Zhang 
3550afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/*
  MatMultAddKernel_SeqAIJCUSPARSE - shared SpMV driver behind all the Mult/MultAdd/
  MultTranspose/MultHermitianTranspose wrappers above.

  Computes zz = op(A)*xx + yy where
    op = A    when !trans,
    op = A^T  when trans && !herm,
    op = A^H  when trans && herm   (herm without trans is rejected by the PetscCheck below).
  yy may be NULL (plain multiply) and may alias zz (in-place add).

  Handles two complications:
  - compressed rows (rows that are entirely zero are dropped from the stored CSR): the product
    of the compressed matrix is held in cusparsestruct->workVector and scattered/gathered
    to/from the full-length vectors;
  - the cuSPARSE generic API (CUDA >= 11) requires cached matrix/vector descriptors and a
    per-operation work buffer, built lazily on first use for each opA.
*/
3551d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3552d71ae5a4SJacob Faibussowitsch {
35539ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3554aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
35559ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3556e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3557e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3558e6e9a74fSStefano Zampini   PetscBool                     compressed;
3559afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3560afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3561afb2bd1cSJunchao Zhang #endif
35626e111a19SKarl Rupp 
35639ae82921SPaul Mullowney   PetscFunctionBegin;
356408401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* Matrix with no stored nonzeros: the product is zero, so zz = yy (or zz = 0 when there is no yy) */
3565cbc6b225SStefano Zampini   if (!a->nz) {
3566995bce04SJacob Faibussowitsch     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3567995bce04SJacob Faibussowitsch     else PetscCall(VecSeq_CUDA::Set(zz, 0));
35683ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
3569e6e9a74fSStefano Zampini   }
357034d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
35719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* Pick which stored matrix to multiply with: the matrix itself, or its explicitly formed transpose.
     For A^H, or when no explicit transpose is kept, cuSPARSE is asked to apply the operation instead. */
3572e6e9a74fSStefano Zampini   if (!trans) {
35739ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
35745f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3575e6e9a74fSStefano Zampini   } else {
35761a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3577e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3578e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3579e6e9a74fSStefano Zampini     } else {
35809566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3581e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3582e6e9a74fSStefano Zampini     }
3583e6e9a74fSStefano Zampini   }
3584e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3585e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3586213423ffSJunchao Zhang 
3587e6e9a74fSStefano Zampini   try {
35889566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
358969d47153SPierre Jolivet     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
35909566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3591afb2bd1cSJunchao Zhang 
35929566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3593e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3594afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3595afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3596afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3597afb2bd1cSJunchao Zhang       */
3598e6e9a74fSStefano Zampini       xptr = xarray;
3599afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3600213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3601afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3602afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3603afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3604afb2bd1cSJunchao Zhang        */
3605afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3606afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3607fe5544b9SJunchao Zhang         nx             = mat->num_cols; // since y = Ax
3608afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3609afb2bd1cSJunchao Zhang       }
3610afb2bd1cSJunchao Zhang #endif
3611e6e9a74fSStefano Zampini     } else {
3612afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3613afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3614afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3615afb2bd1cSJunchao Zhang        */
3616afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3617e6e9a74fSStefano Zampini       dptr = zarray;
3618e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3619afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3620e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3621d0967f54SJacob Faibussowitsch 
        /* workVector[i] = xarray[cprowIndices[i]] for each compressed row, done on the GPU via thrust */
3622d0967f54SJacob Faibussowitsch         thrust::for_each(
3623d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3624d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3625d0967f54SJacob Faibussowitsch #endif
3626d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
36279371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3628e6e9a74fSStefano Zampini       }
3629afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3630afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3631afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3632fe5544b9SJunchao Zhang         nx             = mat->num_rows; // since y = A^T x
3633afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3634afb2bd1cSJunchao Zhang       }
3635afb2bd1cSJunchao Zhang #endif
3636e6e9a74fSStefano Zampini     }
36379ae82921SPaul Mullowney 
3638afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3639aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3640afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3641fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3642fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3643fe5544b9SJunchao Zhang   #else
3644fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3645fe5544b9SJunchao Zhang   #endif
3646fe5544b9SJunchao Zhang 
36475f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3648fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3649fe5544b9SJunchao Zhang       if (!matDescr) {
3650fe5544b9SJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3651fe5544b9SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3652fe5544b9SJunchao Zhang       }
3653fe5544b9SJunchao Zhang   #endif
3654fe5544b9SJunchao Zhang 
      /* First call for this opA: create dense-vector descriptors, size and allocate the SpMV work buffer,
         and (CUDA >= 12.4) run the one-time preprocess step. Subsequent calls only refresh data pointers. */
3655afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
36569566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
36579566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
36589371c9d4SSatish Balay         PetscCallCUSPARSE(
3659fe5544b9SJunchao Zhang           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
36609566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3661fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3662fe5544b9SJunchao Zhang         PetscCallCUSPARSE(
3663fe5544b9SJunchao Zhang           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3664fe5544b9SJunchao Zhang   #endif
3665afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3666afb2bd1cSJunchao Zhang       } else {
3667afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
36689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
36699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3670afb2bd1cSJunchao Zhang       }
3671afb2bd1cSJunchao Zhang 
3672fe5544b9SJunchao Zhang       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3673afb2bd1cSJunchao Zhang #else
36747656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
36759371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3676afb2bd1cSJunchao Zhang #endif
3677aa372e3fSPaul Mullowney     } else {
3678213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3679afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3680afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3681afb2bd1cSJunchao Zhang #else
3682301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
36839371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3684afb2bd1cSJunchao Zhang #endif
3685a65300a6SPaul Mullowney       }
3686aa372e3fSPaul Mullowney     }
36879566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3688aa372e3fSPaul Mullowney 
3689e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3690213423ffSJunchao Zhang       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3691213423ffSJunchao Zhang         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3692995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3693e6e9a74fSStefano Zampini         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3694995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
36957656d835SStefano Zampini         }
3696213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3697995bce04SJacob Faibussowitsch         PetscCall(VecSeq_CUDA::Set(zz, 0));
36987656d835SStefano Zampini       }
36997656d835SStefano Zampini 
3700213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3701213423ffSJunchao Zhang       if (compressed) {
37029566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
37036497c311SBarry Smith         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
37046497c311SBarry Smith         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
37059566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3706e6e9a74fSStefano Zampini       }
3707e6e9a74fSStefano Zampini     } else {
3708995bce04SJacob Faibussowitsch       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3709e6e9a74fSStefano Zampini     }
37109566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
37119566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
37129566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3713d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3714d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3715d71ae5a4SJacob Faibussowitsch   }
  /* Flop accounting: 2 flops per stored nonzero; without the add-vector, one flop per nonempty row is saved */
3716e6e9a74fSStefano Zampini   if (yy) {
37179566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3718e6e9a74fSStefano Zampini   } else {
37199566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3720e6e9a74fSStefano Zampini   }
37213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37229ae82921SPaul Mullowney }
37239ae82921SPaul Mullowney 
/* zz = A^T*xx + yy. Thin wrapper over the shared kernel with trans=TRUE, herm=FALSE. */
3724d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3725d71ae5a4SJacob Faibussowitsch {
3726ca45077fSPaul Mullowney   PetscFunctionBegin;
37279566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
37283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3729ca45077fSPaul Mullowney }
3730ca45077fSPaul Mullowney 
37319ee18893SBarry Smith PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
37329ee18893SBarry Smith 
/* CUDA kernel: extract the diagonal of a CSR matrix with 'len' rows.
   One thread per row x: linearly scans the stored entries of row x (bounded by row[x]..row[x+1])
   for a column index equal to x; writes that value, or 0.0 when row x stores no diagonal entry,
   into diag[x]. Launched by MatGetDiagonal_SeqAIJCUSPARSE with 256-thread blocks. */
37339ee18893SBarry Smith __global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
37349ee18893SBarry Smith {
37359ee18893SBarry Smith   const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
37369ee18893SBarry Smith 
37379ee18893SBarry Smith   if (x < len) {
37389ee18893SBarry Smith     const PetscInt rowx = row[x], num_non0_row = row[x + 1] - rowx;
37399ee18893SBarry Smith     PetscScalar    d = 0.0;
37409ee18893SBarry Smith 
37419ee18893SBarry Smith     for (PetscInt i = 0; i < num_non0_row; i++) {
37429ee18893SBarry Smith       if (col[i + rowx] == x) {
37439ee18893SBarry Smith         d = val[i + rowx];
37449ee18893SBarry Smith         break;
37459ee18893SBarry Smith       }
37469ee18893SBarry Smith     }
37479ee18893SBarry Smith     diag[x] = d;
37489ee18893SBarry Smith   }
37499ee18893SBarry Smith }
37509ee18893SBarry Smith 
/* diag = diagonal of A. If the matrix values are current on the GPU (offload mask BOTH or GPU),
   extracts the diagonal on the device with the GetDiagonal_CSR kernel (CSR format only);
   otherwise falls back to the CPU implementation MatGetDiagonal_SeqAIJ. */
37519ee18893SBarry Smith static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
37529ee18893SBarry Smith {
37539ee18893SBarry Smith   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
37549ee18893SBarry Smith   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
37559ee18893SBarry Smith   PetscScalar                  *darray;
37569ee18893SBarry Smith 
37579ee18893SBarry Smith   PetscFunctionBegin;
37589ee18893SBarry Smith   if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) {
37599ee18893SBarry Smith     PetscInt   n   = A->rmap->n;
37609ee18893SBarry Smith     CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
37619ee18893SBarry Smith 
37629ee18893SBarry Smith     PetscCheck(cusparsestruct->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
    /* n == 0: nothing to extract, and an empty kernel launch is avoided */
37639ee18893SBarry Smith     if (n > 0) {
37649ee18893SBarry Smith       PetscCall(VecCUDAGetArrayWrite(diag, &darray));
37659ee18893SBarry Smith       GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), n, darray);
37669ee18893SBarry Smith       PetscCallCUDA(cudaPeekAtLastError());
37679ee18893SBarry Smith       PetscCall(VecCUDARestoreArrayWrite(diag, &darray));
37689ee18893SBarry Smith     }
37699ee18893SBarry Smith   } else PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
37709ee18893SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
37719ee18893SBarry Smith }
37729ee18893SBarry Smith 
/* Assembly hook: defers entirely to the host-side SeqAIJ assembly; the GPU copy is refreshed
   lazily later (see MatSeqAIJCUSPARSECopyToGPU in the mult kernel). */
3773d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3774d71ae5a4SJacob Faibussowitsch {
3775042217e8SBarry Smith   PetscFunctionBegin;
37769566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
37773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37789ae82921SPaul Mullowney }
37799ae82921SPaul Mullowney 
3780e057df02SPaul Mullowney /*@
378153220ed8SBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
37829ae82921SPaul Mullowney 
3783d083f849SBarry Smith   Collective
37849ae82921SPaul Mullowney 
37859ae82921SPaul Mullowney   Input Parameters:
378611a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
37879ae82921SPaul Mullowney . m    - number of rows
37889ae82921SPaul Mullowney . n    - number of columns
378920f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
379020f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37919ae82921SPaul Mullowney 
37929ae82921SPaul Mullowney   Output Parameter:
37939ae82921SPaul Mullowney . A - the matrix
37949ae82921SPaul Mullowney 
37952ef1f0ffSBarry Smith   Level: intermediate
37962ef1f0ffSBarry Smith 
37972ef1f0ffSBarry Smith   Notes:
37982920cce0SJacob Faibussowitsch   This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
37992920cce0SJacob Faibussowitsch   calculations. For good matrix assembly performance the user should preallocate the matrix
38002920cce0SJacob Faibussowitsch   storage by setting the parameter `nz` (or the array `nnz`).
38012920cce0SJacob Faibussowitsch 
380211a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
38039ae82921SPaul Mullowney   MatXXXXSetPreallocation() paradigm instead of this routine directly.
380411a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
38059ae82921SPaul Mullowney 
380611a5261eSBarry Smith   The AIJ format, also called
38072ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
38089ae82921SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
380920f4b53cSBarry Smith   either one (as in Fortran) or zero.
38109ae82921SPaul Mullowney 
38119ae82921SPaul Mullowney   Specify the preallocated storage with either nz or nnz (not both).
38122ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
381320f4b53cSBarry Smith   allocation.
38149ae82921SPaul Mullowney 
381553220ed8SBarry Smith   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
381653220ed8SBarry Smith 
381753220ed8SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
381853220ed8SBarry Smith           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
38199ae82921SPaul Mullowney @*/
/* See the manual page above. Sequential matrix, so local and global sizes coincide (m, n passed twice). */
3820d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3821d71ae5a4SJacob Faibussowitsch {
38229ae82921SPaul Mullowney   PetscFunctionBegin;
38239566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
38249566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
38259566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
38269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
38273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38289ae82921SPaul Mullowney }
38299ae82921SPaul Mullowney 
/* Destroys the GPU-side data (different paths for factored vs. unfactored matrices),
   clears every method composed on the object, then runs the base SeqAIJ destructor. */
3830d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3831d71ae5a4SJacob Faibussowitsch {
38329ae82921SPaul Mullowney   PetscFunctionBegin;
  /* Unfactored matrices keep a Mat_SeqAIJCUSPARSE in spptr; factored ones keep triangular-factor data */
38339ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
38342c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
38359ae82921SPaul Mullowney   } else {
38369566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3837aa372e3fSPaul Mullowney   }
  /* Reset composed function pointers so the object no longer advertises CUSPARSE capabilities */
38389566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
38399566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
38409566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
38419566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
38429566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
38439566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
38449566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
38459566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
38469566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
38479566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
38489566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
38493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38509ae82921SPaul Mullowney }
38519ae82921SPaul Mullowney 
3852ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
385395639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3854d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3855d71ae5a4SJacob Faibussowitsch {
38569ff858a8SKarl Rupp   PetscFunctionBegin;
38579566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
38589566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
38593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38609ff858a8SKarl Rupp }
38619ff858a8SKarl Rupp 
3862d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3863d71ae5a4SJacob Faibussowitsch {
3864a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3865039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3866039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3867039c6fbaSStefano Zampini   PetscScalar        *ay;
3868039c6fbaSStefano Zampini   const PetscScalar  *ax;
3869039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3870e6e9a74fSStefano Zampini 
387195639643SRichard Tran Mills   PetscFunctionBegin;
3872a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3873a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3874039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
38759566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38769566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
38773ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
387895639643SRichard Tran Mills   }
3879039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
38809566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
38819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
38825f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
38835f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3884039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3885039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3886039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3887039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3888039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3889ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3890039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3891039c6fbaSStefano Zampini   }
3892d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3893d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3894039c6fbaSStefano Zampini 
3895039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3896039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3897039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3898039c6fbaSStefano Zampini     size_t bufferSize;
3899039c6fbaSStefano Zampini     void  *buffer;
3900039c6fbaSStefano Zampini #endif
3901039c6fbaSStefano Zampini 
39029566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
39039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39049566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3905039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
39069371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39079371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
39089566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
39099566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39109371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39119371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
39129566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
39139566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3915039c6fbaSStefano Zampini #else
39169566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39179371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39189371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
39199566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
39209566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3921039c6fbaSStefano Zampini #endif
39229566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
39239566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39249566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3925039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3926a587d139SMark     cublasHandle_t cublasv2handle;
3927a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3928039c6fbaSStefano Zampini 
39299566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
39309566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39319566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39329566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
39339566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39349566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
39359566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
39369566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39379566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39389566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3939039c6fbaSStefano Zampini   } else {
39409566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
39419566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3942a587d139SMark   }
39433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
394495639643SRichard Tran Mills }
394595639643SRichard Tran Mills 
/* Scale all stored nonzeros of Y by the scalar a, performing the operation on the
   GPU with a single cuBLAS scal over the flat value array. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals)); /* device pointer to the nonzero values */
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscFunctionReturn(PETSC_SUCCESS);
}
396433c9ba73SStefano Zampini 
/* Zero all stored entries of A. If the matrix is unfactored and its values already
   live on the GPU, zero them there (and also zero any cached transpose values);
   otherwise fall back to zeroing the host array. The offload mask is updated to
   reflect where the authoritative zeroed data now resides. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij    = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroed = PETSC_FALSE; /* set when the device-side values were zeroed */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
        zeroed = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) { /* keep the cached transpose consistent */
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  if (zeroed) A->offloadmask = PETSC_OFFLOAD_GPU;
  else {
    PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
39923fa6b06aSMark Adams 
/* Report that this matrix type stores its data in CUDA device memory. */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
399903db1824SAlex Lindsay 
/* Bind the matrix to the CPU (flg == PETSC_TRUE) or to the GPU (flg == PETSC_FALSE).

   Binding to the CPU copies the data back to the host, installs the plain SeqAIJ
   implementations in the operation tables, and removes the GPU-specific composed
   functions (COO assembly, sub-array copy, dense products). Binding to the GPU
   installs the CUSPARSE implementations and composes the GPU-specific functions. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; no operation tables are switched */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* clear the aij-level accessors so the SeqAIJ defaults apply */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    /* GPU-aware value-array accessors used by MatSeqAIJGetArray() and friends */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations are enabled only when bound to the CPU and inode info exists */
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4067a587d139SMark 
/* Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.

   MAT_INITIAL_MATRIX duplicates A into *newmat; MAT_REUSE_MATRIX copies the values
   into the existing *newmat; otherwise *newmat is used as-is (in-place conversion).
   On first conversion the cuSPARSE context (spptr) is allocated and default
   algorithms are selected; finally the GPU operation tables are installed via
   MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE) and the type name is changed. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: full context with storage format and algorithm choices */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: lighter-weight triangular-factor context */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
41289ae82921SPaul Mullowney 
/* Type constructor: build a SeqAIJ matrix, then convert it in place to MATSEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
413602fe1965SBarry Smith 
41373ca39a21SBarry Smith /*MC
413853220ed8SBarry Smith    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4139e057df02SPaul Mullowney 
4140e057df02SPaul Mullowney    Options Database Keys:
414153220ed8SBarry Smith +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
414253220ed8SBarry Smith .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
41432ef1f0ffSBarry Smith                                            Other options include ell (ellpack) or hyb (hybrid).
414453220ed8SBarry Smith .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
414553220ed8SBarry Smith -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4146e057df02SPaul Mullowney 
4147e057df02SPaul Mullowney   Level: beginner
4148e057df02SPaul Mullowney 
414953220ed8SBarry Smith   Notes:
415053220ed8SBarry Smith   These matrices can be in either CSR, ELL, or HYB format.
415153220ed8SBarry Smith 
415253220ed8SBarry Smith   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
415353220ed8SBarry Smith 
415453220ed8SBarry Smith   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
415553220ed8SBarry Smith   if some integer values passed in do not fit in `int`.
415653220ed8SBarry Smith 
41571cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4158e057df02SPaul Mullowney M*/
41597f756511SDominic Meiser 
/* Register the cuSPARSE solver package for MATSEQAIJCUSPARSE with every
   factorization type it supports (LU, Cholesky, ILU, ICC); all of them are
   served by the same factor-getter. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (unsigned i = 0; i < sizeof(ftypes) / sizeof(ftypes[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
416929b38603SBarry Smith 
/* Tear down the cuSPARSE context attached to mat->spptr: the device matrix and its
   cached transpose, auxiliary device vectors, the cuSPARSE handle, and the struct
   itself. A matrix without a context is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->csr2csc_i;
  delete cusp->coords;
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
41877f756511SDominic Meiser 
/* Free a CsrMatrix: the three device vectors that hold the CSR representation,
   then the wrapper struct, nulling the caller's pointer. Safe on NULL input. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
42007f756511SDominic Meiser 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor struct used by the legacy (pre CUDA-11.4) csrsv solve
   path: the matrix descriptor, csrsv solve info, CSR storage, and work buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host copy */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
42197f756511SDominic Meiser 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or HYB/ELL on
   pre-11.0 CUDA), its descriptor, device-allocated scalar constants, and — for
   CUDA >= 11 — the generic SpMat/DnVec descriptors and SpMV buffers. Sets
   *matstruct to NULL on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants allocated on the device (freed with cudaFree) */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* up to three cached SpMV configurations; free only the initialized ones */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
42667f756511SDominic Meiser 
/* Release all device and host resources held by the triangular-factor context,
   leaving the still-allocated struct ready to be refilled by a new factorization.
   The CUDA < 11.4 branch tears down the legacy csrsv structs; the >= 11.4 branch
   tears down the SpSV-based data, buffers, and descriptors. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy csrsv path: lower/upper factors and their transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* SpSV path: device CSR arrays, work vectors, solve buffers, and descriptors */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    /* host-side mirrors */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4320ccdfe979SStefano Zampini 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors container: release all factorization
   data via MatSeqAIJCUSPARSETriFactors_Reset(), then destroy the cuSPARSE handle the
   struct owns (Reset leaves the handle alive), and finally free the struct itself.
   A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); /* frees factor matrices/buffers, keeps the handle */
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors)); /* sets *trifactors to NULL */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
43317e8381f9SStefano Zampini 
/* Lexicographic (row, column) ordering for thrust sorts over (i,j) index tuples. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    /* primary key: row index; secondary key: column index */
    return (r1 < r2) || (r1 == r2 && thrust::get<1>(t1) < thrust::get<1>(t2));
  }
};
43407e8381f9SStefano Zampini 
/* Mark the cached transpose of A as out of date so it is rebuilt before next use.
   If destroy is PETSC_TRUE, also free the transpose multiply structure and the
   csr2csc permutation array used to (re)build it. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); /* no GPU data yet: nothing to invalidate */
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i; /* delete on NULL is a no-op */
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4356a49f1ed0SStefano Zampini 
/* PetscContainer destructor for the device-side MatCOOStruct_SeqAIJ attached to the
   matrix as "__PETSc_MatCOOStruct_Device": frees the two device arrays (perm, jmap)
   that MatSetPreallocationCOO_SeqAIJCUSPARSE() cudaMalloc'ed, then the host-resident
   struct holding them. The remaining fields are a shallow copy of the host-side COO
   struct and are owned by its own container. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4367ed502f03SStefano Zampini 
/* Set up COO assembly for a MATSEQAIJCUSPARSE matrix.
   The host preallocation routine builds the CSR pattern plus a host-side
   MatCOOStruct_SeqAIJ (attached as "__PETSc_MatCOOStruct_Host"); this routine then
   mirrors that struct's jmap/perm arrays on the device and attaches the device copy
   as "__PETSc_MatCOOStruct_Device" for later use by MatSetValuesCOO.
   coo_i/coo_j may be in host or device memory. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE; /* were coo_i/coo_j supplied in device memory? */
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host-side preallocation below needs host copies of the index arrays */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  /* builds the CSR pattern and the host-side COO struct (jmap, perm, n, nz, Atot, ...) */
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4408219fbbafSJunchao Zhang 
/* Kernel: scatter user-provided COO values kv[] into the CSR value array a[].
   Grid-stride loop over the nnz CSR entries, so any 1-D grid size is correct.
   CSR entry i receives the sum of kv[perm[k]] for k in [jmap[i], jmap[i+1]);
   with INSERT_VALUES the previous a[i] is discarded, otherwise it is accumulated. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x; /* stride for the grid-stride loop */
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4419219fbbafSJunchao Zhang 
/* Insert/add the COO values v[] into the device CSR value array of A.
   v may live in host or device memory (detected with PetscGetMemType()); host input is
   staged through a temporary device buffer of coo->n entries. The scatter-add itself is
   done by the MatAddCOOValues kernel using the jmap/perm arrays of the device-side COO
   struct attached by MatSetPreallocationCOO_SeqAIJCUSPARSE(). */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz; /* number of CSR entries to update */
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites every entry, so write-only access suffices and avoids a host-to-device copy */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* Compute the block count in 64-bit and only then narrow: the previous code cast
       (Annz + 255) to int *before* dividing, which overflows once Annz >= 2^31.
       The kernel uses a grid-stride loop, so clamping to the CUDA gridDim.x limit
       (2^31 - 1) is always correct. */
    const PetscCount nblocks = PetscMin((Annz + 255) / 256, (PetscCount)2147483647);
    MatAddCOOValues<<<(int)nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing sticky state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the staging buffer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4459219fbbafSJunchao Zhang 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  Both `i` and `j` must be non-`NULL`; if either is `NULL` the routine returns immediately without setting the other

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* caller must request both arrays */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); /* only the CSR format exposes raw index arrays */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily build (and cache) the uncompressed row offsets on the device from the host CSR */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
45075f101d05SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: the device arrays stay owned by the matrix, nothing to hand back */
  /* only invalidate the caller's borrowed pointers */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
45335f101d05SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); /* only the CSR format exposes a raw value array */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* bring the values up to date on the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only: unlike MatSeqAIJCUSPARSEGetArray(), do not change the offload mask
     or invalidate the cached transpose */
  *a = csr->values->data().get(); /* raw device pointer into the thrust value array */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4569ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read access did not modify the matrix, so no object state increase is needed;
     just invalidate the caller's borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4592ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); /* only the CSR format exposes a raw value array */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* bring the values up to date on the device first */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get(); /* raw device pointer into the thrust value array */
  A->offloadmask = PETSC_OFFLOAD_GPU;         /* caller may write through *a: device copy is now the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* values may have changed: bump the object state */
  *a = NULL;                                           /* invalidate the caller's borrowed pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4653039c6fbaSStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

  The matrix must already have a device copy (e.g. from a previous `MatSeqAIJCUSPARSECopyToGPU()`), otherwise an error is raised

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); /* only the CSR format exposes a raw value array */
  /* note: no MatSeqAIJCUSPARSECopyToGPU() here — write-only access skips the host-to-device copy */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get(); /* raw device pointer into the thrust value array */
  A->offloadmask = PETSC_OFFLOAD_GPU;         /* caller is expected to overwrite all values on the device */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4692ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* values were (re)written: bump the object state */
  *a = NULL;                                           /* invalidate the caller's borrowed pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4716ed502f03SStefano Zampini 
/* Ordering for (row, col, value, index) 4-tuples: sorts by row, then by column;
   the value and trailing index components do not participate in the comparison. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;                     /* primary key: row */
    return thrust::get<1>(t1) < thrust::get<1>(t2);   /* tie-break: column */
  }
};
4725ed502f03SStefano Zampini 
/* Unary functor that offsets an int by a fixed shift: Shift(s)(c) == c + s.
   operator() is const-qualified (idiomatic for functors passed by value to
   thrust/STL algorithms, which may invoke them through a const reference). */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};
4732ed502f03SStefano Zampini 
473321afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4734d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4735d71ae5a4SJacob Faibussowitsch {
4736ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4737ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4738ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4739ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4740ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4741ed502f03SStefano Zampini   cusparseStatus_t              stat;
4742ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4743ed502f03SStefano Zampini 
4744ed502f03SStefano Zampini   PetscFunctionBegin;
4745ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4746ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
47474f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4748ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4749ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47505f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
475108401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4752aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4753aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4754ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4755ed502f03SStefano Zampini     m = A->rmap->n;
4756ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47579566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47589566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47599566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4760ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4761ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4762ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4763ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4764ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4765ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4766ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4767ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4768ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4769ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4770ed502f03SStefano Zampini     Ccusp->nrows            = m;
4771ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4772ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4773ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4774ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47759566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47769566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47779566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4778f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4779f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4780f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
47819566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47829566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47839566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47849566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47859566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
478628b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
478728b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4788ed502f03SStefano Zampini 
4789ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4790ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4791ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4792ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4793ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4794ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4795ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4796ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4797ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
47982c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4799ed502f03SStefano Zampini     if (c->nz) {
48002ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
48012ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
48022ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
48032ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
48042ed87e7eSStefano Zampini 
4805ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4806ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4807ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4808ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
48099566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4810ed502f03SStefano Zampini         }
48112ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
48122ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4813ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4814ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4815ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4816ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
48179566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4818ed502f03SStefano Zampini         }
48192ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
48202ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
48219566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48229371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48239371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48249371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48259371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48262ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
48272ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
48282ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
48298909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4830ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4831ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
48328909a122SStefano Zampini #else
48338909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
48348909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
48358909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
48368909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
48378909a122SStefano Zampini #endif
48382ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
48392ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
48402ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
48412ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48422ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48432ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
48442c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
48452c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
4846ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4847792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48488909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48498909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48508909a122SStefano Zampini #endif
48512ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48522ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48532ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4854792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48552ed87e7eSStefano Zampini #else
485659c3d2bbSPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
48572ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
485859c3d2bbSPierre Jolivet   #else
485959c3d2bbSPierre Jolivet       auto pred = cuda::std::identity();
486059c3d2bbSPierre Jolivet   #endif
4861792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4862792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48632ed87e7eSStefano Zampini #endif
48649371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48659371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48669566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48672ed87e7eSStefano Zampini       delete wPerm;
48682ed87e7eSStefano Zampini       delete Acoo;
48692ed87e7eSStefano Zampini       delete Bcoo;
48702ed87e7eSStefano Zampini       delete Ccoo;
4871ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48729371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48739371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4874ed502f03SStefano Zampini #endif
48751a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
48769566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48779566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4878ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4879ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4880ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4881ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4882ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4883ed502f03SStefano Zampini 
48841a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48851a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4886a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4887ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4888ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4889ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4890ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4891ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4892ed502f03SStefano Zampini 
4893ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4894ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4895ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4896ed502f03SStefano Zampini 
48979566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4898ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4899ed502f03SStefano Zampini         if (AT) {
4900ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4901ed502f03SStefano Zampini           thrust::advance(rT, -1);
4902ed502f03SStefano Zampini         }
4903ed502f03SStefano Zampini         if (BT) {
4904ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4905ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4906ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4907ed502f03SStefano Zampini         }
4908ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4909ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4910ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4911ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4912ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4913ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49149566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4915ed502f03SStefano Zampini 
49169566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
49179566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
49189566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4919f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4920f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4921f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
49229566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
49239566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
49249566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4925ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
49269371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
49279371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4928ed502f03SStefano Zampini #endif
4929ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4930ed502f03SStefano Zampini       }
4931ed502f03SStefano Zampini     }
4932ed502f03SStefano Zampini 
4933ed502f03SStefano Zampini     c->free_a = PETSC_TRUE;
49349f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
49359f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4936ed502f03SStefano Zampini     c->free_ij = PETSC_TRUE;
49377de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4938ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4939ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4940ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4941ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
49429566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49439566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4944ed502f03SStefano Zampini     } else {
49459566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49469566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4947ed502f03SStefano Zampini     }
49489566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49499566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49509566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4951ed502f03SStefano Zampini     c->maxnz         = c->nz;
4952ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4953ed502f03SStefano Zampini     c->rmax          = 0;
4954ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4955ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4956ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4957ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4958ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4959ed502f03SStefano Zampini     }
49609566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4961ed502f03SStefano Zampini     (*C)->nonzerostate++;
49629566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49639566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4964ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4965ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4966ed502f03SStefano Zampini   } else {
496708401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4968ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4969ed502f03SStefano Zampini     if (c->nz) {
4970ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49712c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4972aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
497308401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49749566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49759566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49765f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49775f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4978ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4979ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4980ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4981aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4982aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4983aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4984aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49852c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
49862c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
4987ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
49889566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
49892c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
49909371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4991ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49929371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
49932c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4994ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
49959566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
49961a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
49975f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4998ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4999ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5000ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5001ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5002ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
5003ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5004ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
50051a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
5006ed502f03SStefano Zampini       }
50079566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
5008ed502f03SStefano Zampini     }
5009ed502f03SStefano Zampini   }
50109566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5011ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
5012ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
5013ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
50143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5015ed502f03SStefano Zampini }
5016c215019aSStefano Zampini 
/* MatSeqAIJCopySubArray_SeqAIJCUSPARSE - gather selected entries of the GPU value array of A into v

   Input Parameters:
+  A   - the matrix; values are read from its device (CUSPARSE) copy
.  n   - number of entries to copy
-  idx - indices into the aij value array of the entries to gather; if NULL, the first n values are copied contiguously

   Output Parameter:
.  v   - destination array; may be host or device memory (detected with isCudaMem())
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* when v is device memory we can gather in place and skip the host staging copy */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index list, then gather av[idx[i]] on the device with a thrust permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer, copied back to host v below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index list: contiguous copy of the first n values straight from the device array */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved device -> host, so log it as GpuToCpu
     (was incorrectly logged as CpuToGpu; cf. the PetscLogGpuToCpu call after the D2H copies earlier in this file) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5052