xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4e8208cbcbc709572b8abe32f33c78b69c819375)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14
18d0967f54SJacob Faibussowitsch #endif
19a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
20a2cee5feSJed Brown #include <thrust/remove.h>
21a2cee5feSJed Brown #include <thrust/sort.h>
22a2cee5feSJed Brown #include <thrust/unique.h>
2359c3d2bbSPierre Jolivet #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
2459c3d2bbSPierre Jolivet   #include <cuda/std/functional>
2559c3d2bbSPierre Jolivet #endif
26e8d2b73aSMark Adams 
27e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2946aba097SBarry Smith /*
3046aba097SBarry Smith   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31afb2bd1cSJunchao Zhang   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32afb2bd1cSJunchao Zhang */
33afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
36afb2bd1cSJunchao Zhang #endif
379ae82921SPaul Mullowney 
38087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
416fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
436fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
446fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
45d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
466fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
47d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
48d460d7bfSJunchao Zhang #endif
49ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
50a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
5133c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
526fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
536fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
546fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
556fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
56e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
57e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
58e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
599ae82921SPaul Mullowney 
607f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
61470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
62470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
632c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
647f756511SDominic Meiser 
6557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
66a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
6757181aedSStefano Zampini 
68c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
69e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
70219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71c215019aSStefano Zampini 
/*
  Type-specific implementation behind MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.

  For a sequential AIJ matrix there is a single storage format slot, so both
  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same field; any other
  operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations map to the single format field */
  case MAT_CUSPARSE_ALL:
    spstruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
899ae82921SPaul Mullowney 
90e057df02SPaul Mullowney /*@
9111a5261eSBarry Smith   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
9211a5261eSBarry Smith   operation. Only the `MatMult()` operation can use different GPU storage formats
9311a5261eSBarry Smith 
94e057df02SPaul Mullowney   Not Collective
95e057df02SPaul Mullowney 
96e057df02SPaul Mullowney   Input Parameters:
9711a5261eSBarry Smith + A      - Matrix of type `MATSEQAIJCUSPARSE`
982ef1f0ffSBarry Smith . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
992ef1f0ffSBarry Smith            `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
10011a5261eSBarry Smith - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
101e057df02SPaul Mullowney 
102e057df02SPaul Mullowney   Level: intermediate
103e057df02SPaul Mullowney 
104fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
105e057df02SPaul Mullowney @*/
/* Public entry point documented in the manual page above: validates the matrix
   header, then dispatches to the type-specific implementation registered under
   the composed-function name "MatCUSPARSESetFormat_C" (a no-op for matrix
   types that do not register it, since PetscTryMethod tolerates absence). */
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
113e057df02SPaul Mullowney 
/*
  Type-specific implementation behind MatCUSPARSESetUseCPUSolve() for
  MATSEQAIJCUSPARSE: records the user's CPU-vs-GPU triangular-solve
  preference in the per-matrix cuSPARSE context stored in A->spptr.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
122365b711fSMark Adams 
123365b711fSMark Adams /*@
12411a5261eSBarry Smith   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
125365b711fSMark Adams 
126365b711fSMark Adams   Input Parameters:
12711a5261eSBarry Smith + A       - Matrix of type `MATSEQAIJCUSPARSE`
12811a5261eSBarry Smith - use_cpu - set flag for using the built-in CPU `MatSolve()`
129365b711fSMark Adams 
1302ef1f0ffSBarry Smith   Level: intermediate
131365b711fSMark Adams 
13211a5261eSBarry Smith   Note:
13353220ed8SBarry Smith   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
13453220ed8SBarry Smith   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
136365b711fSMark Adams 
1371cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138365b711fSMark Adams @*/
/* Public entry point documented in the manual page above: validates the matrix
   header, then dispatches to the implementation registered under the composed
   function name "MatCUSPARSESetUseCPUSolve_C" (a no-op for matrix types that
   do not register it, since PetscTryMethod tolerates absence). */
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
146365b711fSMark Adams 
/*
  MatSetOption implementation for MATSEQAIJCUSPARSE.

  Only MAT_FORM_EXPLICIT_TRANSPOSE is handled here; every other option is
  forwarded to the base sequential AIJ implementation.
*/
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
162e6e9a74fSStefano Zampini 
/*
  Processes the -mat_cusparse_* command-line options for a MATSEQAIJCUSPARSE
  matrix: storage format for SpMV / for all operations, CPU-vs-GPU triangular
  solve, and (CUDA >= 11) the cuSPARSE SpMV, SpMM and csr2csc algorithm
  choices. Options are only consulted for non-factored matrices.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used only for MatMult (SpMV) */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* storage format for all operations (SpMV and triangular solve) */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1989ae82921SPaul Mullowney 
199b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Builds (first call) or refreshes (later calls) the device-side data needed to
  apply CPU-computed LU factors with cuSPARSE SpSV on the GPU.

  The CPU factored matrix stores L strictly below the diagonal (unit diagonal
  implicit) and U in a "skewed" layout indexed by the diagonal markers. This
  routine merges them into one regular CSR matrix M = L+U on the host, uploads
  it, and creates two cusparseSpMat views of the same CSR arrays — one tagged
  lower/unit-diagonal (L), one upper/non-unit (U) — plus the dense-vector
  descriptors and SpSV work buffers used at solve time. Note the CPU factor
  stores reciprocal diagonal entries, so the diagonal is inverted back below.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];     // number of strictly-lower entries in row i
        PetscInt ulen = adiag[i] - adiag[i + 1]; // number of upper entries in row i, diagonal included
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // the U descriptor shares the same CSR arrays; only fill mode/diag type differ
      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep the host row pointers and a host value buffer so later
      // numeric refactorizations only need to refill values (Mj is no longer needed)
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = adiag[i] - adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; // transpose solves must redo their own analysis
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
305d460d7bfSJunchao Zhang #else
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)306d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
307d71ae5a4SJacob Faibussowitsch {
3089ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3099ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3109ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
311aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3129ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3139ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3149ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3159ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3169ae82921SPaul Mullowney 
3179ae82921SPaul Mullowney   PetscFunctionBegin;
3183ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
319c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3209ae82921SPaul Mullowney     try {
3219ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3229ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
323da79fbbcSStefano Zampini       if (!loTriFactor) {
3242cbc15d9SMark         PetscScalar *AALo;
3252cbc15d9SMark 
3269566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3279ae82921SPaul Mullowney 
3289ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3299566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3309566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3319ae82921SPaul Mullowney 
3329ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3339ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3349ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3359ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3369ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3379ae82921SPaul Mullowney         v         = aa;
3389ae82921SPaul Mullowney         vi        = aj;
3399ae82921SPaul Mullowney         offset    = 1;
3409ae82921SPaul Mullowney         rowOffset = 1;
3419ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3429ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
343e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3449ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3459ae82921SPaul Mullowney           rowOffset += nz + 1;
3469ae82921SPaul Mullowney 
347f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
348f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
3499ae82921SPaul Mullowney 
3509ae82921SPaul Mullowney           offset += nz;
3519ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3529ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3539ae82921SPaul Mullowney           offset += 1;
3549ae82921SPaul Mullowney 
3559ae82921SPaul Mullowney           v += nz;
3569ae82921SPaul Mullowney           vi += nz;
3579ae82921SPaul Mullowney         }
3582205254eSKarl Rupp 
359aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3609566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
361da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
362aa372e3fSPaul Mullowney         /* Create the matrix description */
3639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3649566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3651b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3669566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
367afb2bd1cSJunchao Zhang   #else
3689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
369afb2bd1cSJunchao Zhang   #endif
3709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
372aa372e3fSPaul Mullowney 
373aa372e3fSPaul Mullowney         /* set the operation */
374aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
375aa372e3fSPaul Mullowney 
376aa372e3fSPaul Mullowney         /* set the matrix */
377aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
378aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
379aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
380aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
381aa372e3fSPaul Mullowney 
382aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
383aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
384aa372e3fSPaul Mullowney 
385aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
386aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
387aa372e3fSPaul Mullowney 
388aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
389aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
390aa372e3fSPaul Mullowney 
391afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
3929566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
393261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
3941b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3959371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
3969371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
3979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
398afb2bd1cSJunchao Zhang   #endif
399afb2bd1cSJunchao Zhang 
400aa372e3fSPaul Mullowney         /* perform the solve analysis */
4019371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4029f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4039566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4049566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
405aa372e3fSPaul Mullowney 
406da79fbbcSStefano Zampini         /* assign the pointer */
407aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4082cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4099566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4109566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4119566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
412da79fbbcSStefano Zampini       } else { /* update values only */
41348a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
414da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4152cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
416da79fbbcSStefano Zampini         v                    = aa;
417da79fbbcSStefano Zampini         vi                   = aj;
418da79fbbcSStefano Zampini         offset               = 1;
419da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
420da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
421f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
422da79fbbcSStefano Zampini           offset += nz;
4232cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
424da79fbbcSStefano Zampini           offset += 1;
425da79fbbcSStefano Zampini           v += nz;
426da79fbbcSStefano Zampini         }
4272cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4289566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
429da79fbbcSStefano Zampini       }
430d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
431d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
432d71ae5a4SJacob Faibussowitsch     }
4339ae82921SPaul Mullowney   }
4343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4359ae82921SPaul Mullowney }
4369ae82921SPaul Mullowney 
/*
  Builds -- or, on repeated calls, value-updates -- the device copy of the upper
  triangular ILU factor U used by the legacy (pre CUDA-11.4) cusparse csrsv
  triangular-solve path.

  The host factored matrix stores U in reverse via adiag[] (hence the row loop
  running from i = n-1 down to 0), and keeps the inverse of the diagonal at
  aa[adiag[i]] (cf. MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky below), so
  1./v[nz] recovers the true diagonal value required by the
  CUSPARSE_DIAG_TYPE_NON_UNIT descriptor.

  First call (upTriFactor == NULL): assemble U in pinned host buffers, upload
  row offsets/column indices/values into a fresh CsrMatrix, and run the csrsv
  solve analysis. The pinned value buffer is retained in upTriFactor->AA_h so
  later numeric refactorizations only refill and re-upload the values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj                 = a->j, *adiag, *vi;
  const MatScalar                   *aa                 = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first time: build structure and values */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first
           since the host factor stores U backwards through adiag[] */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* values/columns of row i of U, just past the diagonal */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; v[nz] == aa[adiag[i]] holds the
             stored inverse diagonal, so invert it back for the NON_UNIT solve */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep AAUp (as AA_h) so later numeric updates can reuse it */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only; the sparsity pattern is unchanged */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements (invert the stored inverse diagonal) */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
568d460d7bfSJunchao Zhang #endif
5699ae82921SPaul Mullowney 
/*
  Pushes the host ILU factors of A to the GPU and caches the row/column
  permutations on the device. Depending on the CUDA version it dispatches to
  either the generic SpSV builder (CUDA >= 11.4) or the legacy lower/upper
  triangular builders. Permutation index arrays are uploaded only once and
  only when the corresponding IS is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS = aij->row, icolIS = aij->icol;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif

  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  // Cache the row permutation on the device, unless it is the identity or already cached
  PetscCall(ISIdentity(rowIS, &rowIdentity));
  if (!rowIdentity && !fs->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(rowIS, &r));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(r, r + m);
    PetscCall(ISRestoreIndices(rowIS, &r));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  // Likewise for the inverse column permutation
  PetscCall(ISIdentity(icolIS, &colIdentity));
  if (!colIdentity && !fs->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(icolIS, &c));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(c, c + m);
    PetscCall(ISRestoreIndices(icolIS, &c));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
6169ae82921SPaul Mullowney 
617b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Builds -- or, on repeated calls, value-updates -- the device copy of the
  Ut D U Cholesky/ICC factorization for the generic cusparse SpSV path
  (this block is compiled only for CUDA >= 11.4).

  The unit-diagonal U is stored as a regular CSR matrix on the device together
  with a separate array holding the (already inverted) diagonal D. The first
  call allocates all device storage, creates the SpSV descriptors for both the
  U and Ut solves, and sizes their work buffers; later calls only refresh the
  numerical values (using cusparseSpSV_updateMatrix when available, otherwise
  re-running the numeric SpSV analysis).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse: keep the host staging arrays for values (Ma) and diagonal (D)
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // off-diagonal entries are copied negated from the host factor (sign convention of the host ICC storage; see layout comments above -- TODO confirm)
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      // Cheap path: the sparsity pattern is unchanged, so just push the new values into both SpSV descriptors
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
712d460d7bfSJunchao Zhang 
// Solve Ut D U x = b using the device factors built by MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky().
// Pipeline: (optionally row-permute b) -> SpSV with Ut -> elementwise scale by the
// inverted diagonal D -> SpSV with U -> (optionally column-permute into x).
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: alias the dense-vector descriptor directly to b's device array
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  #if CCCL_VERSION >= 3001000
  // newer CCCL moved the multiplies functor into cuda::std
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>()));
  #else
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
  #endif

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772d460d7bfSJunchao Zhang #else
/*
  Builds the GPU triangular-factor structures used by the (pre CUDA 11.4) ICC/Cholesky
  MatSolve path from the host-side factored matrix stored in A.

  Two Mat_SeqAIJCUSPARSETriFactorStructs are produced, sharing one sparsity pattern
  (only the upper triangle is stored):
    - upTriFactor: solved with CUSPARSE_OPERATION_NON_TRANSPOSE, unit diagonal
      (the stored diagonal entries are not used by the cuSPARSE solve);
    - loTriFactor: the same pattern solved with CUSPARSE_OPERATION_TRANSPOSE and a
      non-unit diagonal, i.e. the "lower" solve is performed as U^T.

  On the first call (no factor structs yet) the index arrays AiUp/AjUp are built and
  uploaded; on later calls only the numerical values are refreshed on the GPU.
  No-op unless the matrix data currently lives on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is also viewed through the SeqSBAIJ layout here (same i/j/a
     field offsets are relied upon) — presumably because the ICC factor stores only the
     upper triangle; confirm against MatICCFactorSymbolic_SeqAIJ */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] appears to be the diagonal entry of row i (stored after the
             off-diagonals) — TODO confirm against the host ICC factor layout */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* copy the off-diagonal pattern/values, then negate for AAUp and
               additionally scale by the diagonal for AALo */
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        /* device copies of the CSR arrays (thrust assign performs the H2D transfer) */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* note: fill mode stays UPPER — the lower solve is expressed as a TRANSPOSE
           solve on the stored upper pattern (see solveOp below) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same sparsity pattern as the upper factor; only the values differ */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Refresh path: the pattern already lives on the GPU, so recompute only the
           numerical values (same transformations as the first-call loop above) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969d460d7bfSJunchao Zhang #endif
970087f3262SPaul Mullowney 
/*
  Sets up the GPU-side ICC/Cholesky factor data for A and, when the factorization
  used a non-trivial ordering, uploads the row permutation and its inverse so the
  solve routines can permute the right-hand side and solution vectors on the device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscInt                      m       = A->rmap->n;
  PetscBool                     identity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m); /* scratch for two-stage solves */
#endif
  factors->nnz = (aij->nz - m) * 2 + m; /* both triangular factors, diagonal counted once */

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload permutation indices only when the ordering is not the identity */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              inv;
    const PetscInt *r, *ir;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &inv));
    PetscCall(ISGetIndices(inv, &ir));
    PetscCall(ISGetIndices(rowperm, &r));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(r, r + m);   /* forward permutation */
    factors->cpermIndices->assign(ir, ir + m); /* inverse permutation */
    PetscCall(ISRestoreIndices(inv, &ir));
    PetscCall(ISDestroy(&inv));
    PetscCall(ISRestoreIndices(rowperm, &r));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012087f3262SPaul Mullowney 
/*
  Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: compute the factor on the
  host with the SeqAIJ kernel, install the CUSPARSE solve callbacks, then build and
  upload the triangular-factor data to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host factorization needs A's values on the CPU */
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* choose the solve flavor: natural-ordering variants skip the vector permutation */
  {
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    PetscBool   natural;

    PetscCall(ISIdentity(bseq->row, &natural));
    B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
10459ae82921SPaul Mullowney 
1046b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)1047d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048d71ae5a4SJacob Faibussowitsch {
1049bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1055aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1056aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1057aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1058b175d8bbSPaul Mullowney 
1059bda325fcSPaul Mullowney   PetscFunctionBegin;
1060aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10619566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1062da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063aa372e3fSPaul Mullowney 
1064aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1065aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1066aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10679371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069aa372e3fSPaul Mullowney 
1070aa372e3fSPaul Mullowney   /* Create the matrix description */
10719566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10729566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10739566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10749566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10759566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076aa372e3fSPaul Mullowney 
1077aa372e3fSPaul Mullowney   /* set the operation */
1078aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079aa372e3fSPaul Mullowney 
1080aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1081aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1082afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088aa372e3fSPaul Mullowney 
1089aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10919371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
10929371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10939371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10949566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095afb2bd1cSJunchao Zhang   #endif
1096afb2bd1cSJunchao Zhang 
10979566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10989f7ba44dSJacob Faibussowitsch   {
10999f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11009f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
11019371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11039f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104afb2bd1cSJunchao Zhang   #else
11059f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106afb2bd1cSJunchao Zhang   #endif
11079f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11089f7ba44dSJacob Faibussowitsch   }
11099f7ba44dSJacob Faibussowitsch 
11109566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11119566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112aa372e3fSPaul Mullowney 
1113afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11149566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11161b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11179371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11189371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11199566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120afb2bd1cSJunchao Zhang   #endif
1121afb2bd1cSJunchao Zhang 
1122afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11239371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11249f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11259f7ba44dSJacob Faibussowitsch 
11269566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11279566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128aa372e3fSPaul Mullowney 
1129da79fbbcSStefano Zampini   /* assign the pointer */
1130aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131aa372e3fSPaul Mullowney 
1132aa372e3fSPaul Mullowney   /*********************************************/
1133aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1134aa372e3fSPaul Mullowney   /*********************************************/
1135aa372e3fSPaul Mullowney 
1136aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11379566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1138da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139aa372e3fSPaul Mullowney 
1140aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1141aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1142aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11439371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145aa372e3fSPaul Mullowney 
1146aa372e3fSPaul Mullowney   /* Create the matrix description */
11479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11489566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11499566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11509566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11519566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152aa372e3fSPaul Mullowney 
1153aa372e3fSPaul Mullowney   /* set the operation */
1154aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155aa372e3fSPaul Mullowney 
1156aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1157aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1158afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164aa372e3fSPaul Mullowney 
1165aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11679371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11689371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11699371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11709566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171afb2bd1cSJunchao Zhang   #endif
1172afb2bd1cSJunchao Zhang 
11739566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11749f7ba44dSJacob Faibussowitsch   {
11759f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11769f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11779371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11799f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180afb2bd1cSJunchao Zhang   #else
11819f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182afb2bd1cSJunchao Zhang   #endif
11839f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11849f7ba44dSJacob Faibussowitsch   }
1185d49cd2b7SBarry Smith 
11869566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11879566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188aa372e3fSPaul Mullowney 
1189afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11909566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
11921b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11939371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11949371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
11959566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196afb2bd1cSJunchao Zhang   #endif
1197afb2bd1cSJunchao Zhang 
1198afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11995f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
12009371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12019f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202d49cd2b7SBarry Smith 
12039566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12049566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205aa372e3fSPaul Mullowney 
1206da79fbbcSStefano Zampini   /* assign the pointer */
1207aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1209bda325fcSPaul Mullowney }
1210d460d7bfSJunchao Zhang #endif
1211bda325fcSPaul Mullowney 
/* Unary functor that converts a PetscScalar to a PetscInt by truncating its real
   part. Used with thrust::transform to read back integer indices that were stored
   (as scalars) in the values array of a csr2csc conversion. Callable on both host
   and device. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar x) { return static_cast<PetscInt>(PetscRealPart(x)); }
};
1215a49f1ed0SStefano Zampini 
/* Build (or refresh) the explicit transpose of A on the GPU, cached in
   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

   Fast path: if A->transupdated is already set (and the cached transpose struct
   exists), nothing is done. Otherwise, for the CSR format the transpose is formed
   with cusparse csr2csc; the integer permutation csr2csc_i mapping A's values into
   the transposed values array is computed once and cached, so subsequent calls only
   permute the (possibly updated) numerical values instead of redoing the full
   csr2csc conversion. ELL/HYB formats are only handled for CUDA < 11. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  /* make sure the host matrix data is mirrored on the GPU before transposing */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  /* if the transpose is flagged up-to-date, the cached struct must exist */
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* the cached-permutation update path below only supports CSR; drop any stale transpose otherwise */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants used by later SpMV-type calls */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of A^T: row/column dimensions are swapped, nnz is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload A's host row offsets (a->i), needed as csr2csc input */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* pre-CUDA-11 path: HYB has no direct transpose, so go HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Compute the value permutation once: run csr2csc on the sequence 0,1,2,...
         stored as scalars; the "values" of the transpose are then exactly the
         permutation indices, which we convert to integers and cache in csr2csc_i. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* scatter A's current values through the cached permutation to refresh the transpose's values */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408bda325fcSPaul Mullowney 
1409b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b using the cached cuSPARSE SpSV descriptors of the LU factors
   (fs->spMatDescr_L, fs->spMatDescr_U): forward solve L Y = P_r b, then backward
   solve U X = Y, then apply the column permutation.

   b is permuted by fs->rpermIndices (when present) into the scratch vector fs->X
   before the solves, and the result is permuted by fs->cpermIndices (when present)
   back into x; when a permutation is absent the corresponding raw array is wrapped
   directly to avoid the extra copy. Assumes the SpSV analysis for L and U was
   already done (cusparseSpSV_solve reuses the analysis-time external buffer). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no row permutation: point the dense-vector descriptor straight at b's array
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    // column permutation pending: solve into the scratch fs->X, permute into x afterwards
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no column permutation: solve directly into x's array
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two flops per factored nonzero minus one per row: cost of the two triangular solves */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1461d460d7bfSJunchao Zhang 
MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A,Vec b,Vec x)1462d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1463d460d7bfSJunchao Zhang {
1464d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1465d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1466d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1467d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1468d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1469d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1470d460d7bfSJunchao Zhang   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1471d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1472d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1473d460d7bfSJunchao Zhang 
1474d460d7bfSJunchao Zhang   PetscFunctionBegin;
1475d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1476d460d7bfSJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1477d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1478d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1479d460d7bfSJunchao Zhang                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1480d460d7bfSJunchao Zhang 
1481d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1482d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1483d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1484d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1485d460d7bfSJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1486d460d7bfSJunchao Zhang   }
1487d460d7bfSJunchao Zhang 
1488d460d7bfSJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
1489d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1490d460d7bfSJunchao Zhang 
1491d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1492d460d7bfSJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1493d460d7bfSJunchao Zhang   }
1494d460d7bfSJunchao Zhang 
1495d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1496d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1497d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1498d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1499d460d7bfSJunchao Zhang 
1500d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1501d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1502d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1503d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1504d460d7bfSJunchao Zhang   } else {
1505d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1506d460d7bfSJunchao Zhang   }
1507d460d7bfSJunchao Zhang 
1508d460d7bfSJunchao Zhang   // Solve Ut Y = X
1509d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1510d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1511d460d7bfSJunchao Zhang 
1512d460d7bfSJunchao Zhang   // Solve Lt X = Y
1513d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1514d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1515d460d7bfSJunchao Zhang   } else {
1516d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1517d460d7bfSJunchao Zhang   }
1518d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1519d460d7bfSJunchao Zhang 
1520d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1521d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1522d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1523d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1524d460d7bfSJunchao Zhang   }
1525d460d7bfSJunchao Zhang 
1526d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1527d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1528d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1529d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1530d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1531d460d7bfSJunchao Zhang }
1532d460d7bfSJunchao Zhang #else
1533a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU via the legacy cusparse csrsv API (pre CUDA-11.4 path).
   With factors A = L*U, the transposed system is solved as U^T y = P_r b followed by
   L^T x = y, after which the column permutation is applied to the result.  The
   transposed triangular-factor structs are created lazily on first use; the work
   vector tempGPU holds intermediate results between the two triangular solves and
   the final permutation copy. */
MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)1534d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1535d71ae5a4SJacob Faibussowitsch {
1536c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1537465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1538465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1539465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1540465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1541bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1542aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1543aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1544aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1545bda325fcSPaul Mullowney 
1546bda325fcSPaul Mullowney   PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; the pointers fetched
     above were NULL in that case, so refresh them after the analysis */
1547aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1548aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15499566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1550aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1551aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1552bda325fcSPaul Mullowney   }
1553bda325fcSPaul Mullowney 
1554bda325fcSPaul Mullowney   /* Get the GPU pointers */
15559566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15569566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1557c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1558c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1559bda325fcSPaul Mullowney 
15609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, gathering b[rperm[i]] into x, which is
     then used as the right-hand side of the first triangular solve */
1561aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15629371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1563aa372e3fSPaul Mullowney 
  /* First, solve U: since A^T = U^T L^T, U^T comes first; result goes to tempGPU */
1564aa372e3fSPaul Mullowney   /* First, solve U */
15659f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15669f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1567aa372e3fSPaul Mullowney 
  /* Then, solve L: the backward solve, writing the (still row/column permuted) solution into xarray */
1568aa372e3fSPaul Mullowney   /* Then, solve L */
15699f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15709f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1571aa372e3fSPaul Mullowney 
1572aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15739371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1574aa372e3fSPaul Mullowney 
1575aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1576a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1577bda325fcSPaul Mullowney 
1578bda325fcSPaul Mullowney   /* restore */
15799566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
15809566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
15819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves touch each stored factor entry with one multiply-add; the
     unit-diagonal of one factor saves n operations, matching 2*nnz - n flops */
15829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
15833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1584bda325fcSPaul Mullowney }
1585bda325fcSPaul Mullowney 
/* Solve A^T x = b on the GPU when the factorization used natural (identity) ordering,
   so no row/column permutations are needed (legacy cusparse csrsv API path).  Solves
   U^T y = b into the work vector, then L^T x = y directly into x.  Transposed factor
   structs are created lazily on first use. */
MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)1586d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587d71ae5a4SJacob Faibussowitsch {
1588465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1589465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1590bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1594bda325fcSPaul Mullowney 
1595bda325fcSPaul Mullowney   PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; refresh the local
     pointers afterwards since they were NULL before the analysis */
1596aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1597aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601bda325fcSPaul Mullowney   }
1602bda325fcSPaul Mullowney 
1603bda325fcSPaul Mullowney   /* Get the GPU pointers */
16049566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16059566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1606bda325fcSPaul Mullowney 
16079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U: for A^T = U^T L^T the upper-transpose solve comes first,
     reading b directly (no permutation) and writing to the work vector */
1608aa372e3fSPaul Mullowney   /* First, solve U */
16099f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16109f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1611aa372e3fSPaul Mullowney 
  /* Then, solve L: writes the final solution straight into the x array */
1612aa372e3fSPaul Mullowney   /* Then, solve L */
16139f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16149f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1615bda325fcSPaul Mullowney 
1616bda325fcSPaul Mullowney   /* restore */
16179566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16189566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1622bda325fcSPaul Mullowney }
1623bda325fcSPaul Mullowney 
/* Solve A x = b on the GPU with the triangular factors A = L*U and fill-reducing
   row/column permutations (legacy cusparse csrsv API path).  Pipeline: gather b with
   the row permutation into the work vector, forward-solve L, back-solve U, then
   scatter the result through the column permutation into x. */
MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)1624d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625d71ae5a4SJacob Faibussowitsch {
1626465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1627465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1628465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1629465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16309ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16349ae82921SPaul Mullowney 
16359ae82921SPaul Mullowney   PetscFunctionBegin;
1636e057df02SPaul Mullowney   /* Get the GPU pointers */
16379566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16389566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1640c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16419ae82921SPaul Mullowney 
16429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]]; the iteration
     length is fixed by the begin()/end() index iterators of the permutation */
1643aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
16449371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1645aa372e3fSPaul Mullowney 
  /* Next, solve L: forward substitution, permuted rhs in tempGPU, result into xarray */
1646aa372e3fSPaul Mullowney   /* Next, solve L */
16479f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16489f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1649aa372e3fSPaul Mullowney 
  /* Then, solve U: back substitution, result lands back in tempGPU so the final
     permuted scatter into x below cannot alias its own input */
1650aa372e3fSPaul Mullowney   /* Then, solve U */
16519f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16529f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1653d49cd2b7SBarry Smith 
16544e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16559371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16569ae82921SPaul Mullowney 
16579566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16589566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
  /* Each factor entry contributes one multiply-add across the two solves; the unit
     diagonal saves n divisions, hence 2*nnz - n flops */
16609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16629ae82921SPaul Mullowney }
16639ae82921SPaul Mullowney 
/* Solve A x = b on the GPU when the factorization used natural (identity) ordering,
   so no permutation gathers/scatters are needed (legacy cusparse csrsv API path).
   Forward-solves L directly from b into the work vector, then back-solves U into x. */
MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)1664d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665d71ae5a4SJacob Faibussowitsch {
1666465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1667465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16689ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16729ae82921SPaul Mullowney 
16739ae82921SPaul Mullowney   PetscFunctionBegin;
1674e057df02SPaul Mullowney   /* Get the GPU pointers */
16759566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16769566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16779ae82921SPaul Mullowney 
16789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: forward substitution reads b directly (no row permutation)
     and writes the intermediate result into the work vector */
1679aa372e3fSPaul Mullowney   /* First, solve L */
16809f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16819f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1682d49cd2b7SBarry Smith 
  /* Next, solve U: back substitution writes the final solution straight into x */
1683aa372e3fSPaul Mullowney   /* Next, solve U */
16849f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16859f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
16869ae82921SPaul Mullowney 
16879566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16889566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16909566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16929ae82921SPaul Mullowney }
1693d460d7bfSJunchao Zhang #endif
16949ae82921SPaul Mullowney 
1695b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization on the GPU via cusparseXcsrilu02 (CUDA >= 11.4 path).
   Copies A's values into the factor matrix and factorizes in place; the symbolic phase
   (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0) has already set up descriptors, index
   arrays and buffers.  After factorization the SpSV triangular-solve descriptors are
   refreshed: with CUDA >= 12.1.1 via the cheap cusparseSpSV_updateMatrix() once an
   analysis exists, otherwise by redoing cusparseSpSV_analysis(), which is numeric and
   therefore must run after csrilu02 (values must be valid).  Finally installs the
   generic-API solve callbacks on fact. */
MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *)16968eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697d71ae5a4SJacob Faibussowitsch {
1698da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1700da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1702da112707SJunchao Zhang   PetscInt                      m, nz;
1703da112707SJunchao Zhang   PetscBool                     flg;
1704da112707SJunchao Zhang 
1705da112707SJunchao Zhang   PetscFunctionBegin;
  /* Debug-only sanity check that the input really is a MATSEQAIJCUSPARSE */
1706da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1707da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709da112707SJunchao Zhang   }
1710da112707SJunchao Zhang 
  /* Copy A's value to fact (device-to-device, asynchronous on PETSc's default stream) */
1711da112707SJunchao Zhang   /* Copy A's value to fact */
1712da112707SJunchao Zhang   m  = fact->rmap->n;
1713da112707SJunchao Zhang   nz = aij->nz;
1714da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1716da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1717da112707SJunchao Zhang 
1718bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeBegin());
1719da112707SJunchao Zhang   /* Factorize fact inplace */
17209371c9d4SSatish Balay   if (m)
17219371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1722d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  /* Debug-only zero-pivot check; in optimized builds a numerical zero pivot goes undetected here */
1723da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1724da112707SJunchao Zhang     int              numerical_zero;
1725da112707SJunchao Zhang     cusparseStatus_t status;
1726da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1727da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1728da112707SJunchao Zhang   }
1729da112707SJunchao Zhang 
  /* Fast path (CUDA >= 12.1.1): if a SpSV analysis already exists, just push the new
     values into the L and U solve descriptors instead of re-analyzing */
1730204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1731204a0e31SJunchao Zhang   if (fs->updatedSpSVAnalysis) {
1732204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1733204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1734204a0e31SJunchao Zhang   } else
1735204a0e31SJunchao Zhang   #endif
1736204a0e31SJunchao Zhang   {
173712ba2bc6SJunchao Zhang     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
173812ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
173912ba2bc6SJunchao Zhang     */
17409371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1741da112707SJunchao Zhang 
17429371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1743da112707SJunchao Zhang 
1744204a0e31SJunchao Zhang     fs->updatedSpSVAnalysis = PETSC_TRUE;
174512ba2bc6SJunchao Zhang     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
174612ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1747204a0e31SJunchao Zhang   }
174812ba2bc6SJunchao Zhang 
  /* Values now live on the GPU only; install the generic-API (SpSV) solve callbacks */
1749da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1750d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1751d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1752da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1753da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1754bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeEnd());
  /* numericFactFlops was precomputed during the symbolic phase -- TODO confirm against symbolic routine */
1755da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1757da112707SJunchao Zhang }
1758da112707SJunchao Zhang 
MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS,IS,const MatFactorInfo * info)17598eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1760d71ae5a4SJacob Faibussowitsch {
1761da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1762da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1763da112707SJunchao Zhang   PetscInt                      m, nz;
1764da112707SJunchao Zhang 
1765da112707SJunchao Zhang   PetscFunctionBegin;
1766da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1767421480d9SBarry Smith     PetscBool flg, diagDense;
1768da112707SJunchao Zhang 
1769da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1770da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1771da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1772421480d9SBarry Smith     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1773421480d9SBarry Smith     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
1774da112707SJunchao Zhang   }
1775da112707SJunchao Zhang 
1776da112707SJunchao Zhang   /* Free the old stale stuff */
1777da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1778da112707SJunchao Zhang 
1779da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1780da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1781da112707SJunchao Zhang    */
1782da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1783da112707SJunchao Zhang 
1784da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1785da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1786da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1787da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1788da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1789da112707SJunchao Zhang 
1790da112707SJunchao Zhang   aij->row = NULL;
1791da112707SJunchao Zhang   aij->col = NULL;
1792da112707SJunchao Zhang 
1793da112707SJunchao Zhang   /* ====================================================================== */
1794da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1795da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1796da112707SJunchao Zhang   /* ====================================================================== */
1797da112707SJunchao Zhang   const int *Ai, *Aj;
1798da112707SJunchao Zhang 
1799da112707SJunchao Zhang   m  = fact->rmap->n;
1800da112707SJunchao Zhang   nz = aij->nz;
1801da112707SJunchao Zhang 
1802f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1803f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1804f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1805d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1806d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1807d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1808da112707SJunchao Zhang 
1809da112707SJunchao Zhang   /* ====================================================================== */
1810da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1811da112707SJunchao Zhang   /* ====================================================================== */
1812da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1813da112707SJunchao Zhang   cusparseDiagType_t diagType;
1814da112707SJunchao Zhang 
1815da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1816da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1817da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1818da112707SJunchao Zhang 
1819da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1820da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1821da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1822da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1823da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1824da112707SJunchao Zhang   */
1825da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1826da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1827d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18289371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18299371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1830da112707SJunchao Zhang 
1831da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1832da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1833d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18349371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18359371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1836da112707SJunchao Zhang 
1837da112707SJunchao Zhang   /* ========================================================================= */
1838da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1839da112707SJunchao Zhang   /* ========================================================================= */
1840da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18419371c9d4SSatish Balay   if (m)
18429371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1843d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1844da112707SJunchao Zhang 
1845da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1846da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1847da112707SJunchao Zhang 
1848da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1849da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1850da112707SJunchao Zhang 
1851da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18529371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1853da112707SJunchao Zhang 
1854da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18559371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1856da112707SJunchao Zhang 
1857da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
185812ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
185912ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
186012ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1861da112707SJunchao Zhang    */
186212ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
186312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
186412ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1865da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
186612ba2bc6SJunchao Zhang   } else {
186712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
186812ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1869da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
187012ba2bc6SJunchao Zhang   }
1871da112707SJunchao Zhang 
1872da112707SJunchao Zhang   /* ========================================================================== */
1873da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1874da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1875da112707SJunchao Zhang   /* ========================================================================== */
1876da112707SJunchao Zhang   int              structural_zero;
1877da112707SJunchao Zhang   cusparseStatus_t status;
1878da112707SJunchao Zhang 
1879da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18809371c9d4SSatish Balay   if (m)
18819371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1882d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1883da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
188446aba097SBarry Smith     /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1885da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1886da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1887da112707SJunchao Zhang   }
1888da112707SJunchao Zhang 
1889da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18900dd8c0acSJunchao Zhang   {
1891da112707SJunchao Zhang     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
1892421480d9SBarry Smith     PetscInt       *Ai, nzRow, nzLeft;
1893421480d9SBarry Smith     const PetscInt *adiag;
1894da112707SJunchao Zhang     PetscLogDouble  flops = 0.0;
1895da112707SJunchao Zhang 
1896421480d9SBarry Smith     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
1897da112707SJunchao Zhang     Ai = Aseq->i;
1898da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1899421480d9SBarry Smith       if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1900da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1901421480d9SBarry Smith         nzLeft = adiag[i] - Ai[i];
1902da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1903da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1904da112707SJunchao Zhang         */
1905da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1906da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1907da112707SJunchao Zhang       }
1908da112707SJunchao Zhang     }
1909da112707SJunchao Zhang     fs->numericFactFlops = flops;
19100dd8c0acSJunchao Zhang   }
1911da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1913da112707SJunchao Zhang }
1914da112707SJunchao Zhang 
/* Solve (L*L^T) x = b with the level-0 incomplete Cholesky factor held in fs->spMatDescr_L.

   Only the lower factor L is stored; the backward sweep reuses it with CUSPARSE_OPERATION_TRANSPOSE
   (conjugate transpose is not used here -- see the note in MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0()).
   Input/output vectors are accessed directly on the device via their CUDA arrays.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barr; /* device array of b (read-only) */
  PetscScalar                  *xarr; /* device array of x (write-only) */

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarr));
  PetscCall(VecCUDAGetArrayRead(b, &barr));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward sweep: solve L y = b, with y kept in the workspace fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barr));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward sweep: solve L^T x = y, writing straight into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barr));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarr));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1945da112707SJunchao Zhang 
/* Numeric phase of the level-0 incomplete Cholesky (IC(0)) factorization on the GPU via cusparseXcsric02().

   The symbolic phase (MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0) already copied A's sparsity pattern into
   fact and allocated the device buffers reachable through fs; here we only refresh the values from A and
   factor them in place, then (re)do or update the SpSV analysis for the triangular solves.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact (device-to-device; pattern was fixed at symbolic time) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  /* Guard m == 0: cusparseXcsric02 errors out on empty matrices */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot queries are blocking (they synchronize the device), hence debug-only */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* CUDA >= 12.1.1 lets us reuse a previous SpSV analysis and only push the new factor values */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE,
       so plain TRANSPOSE is used for the backward solve:
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  /* The factor now lives on the GPU; install the SpSV-based solve for both solve and solvetranspose
     (the IC factorization is symmetric, so the same routine serves both) */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2007da112707SJunchao Zhang 
/* Symbolic phase of the level-0 incomplete Cholesky (IC(0)) factorization on the GPU.

   Since IC(0) introduces no fill, fact simply adopts A's sparsity pattern (copied device-to-device).
   This routine allocates all device storage, creates the cuSPARSE descriptors for M (the in-place
   factor storage) and L (the lower triangular view of it), sizes and allocates the csric02/SpSV work
   buffers, runs the csric02 structural analysis, and estimates the FLOPs of one numeric factorization.
   The unused IS parameter is the (identity) permutation; the caller only selects this path when the
   permutation is the identity.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) adds no fill by construction */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* M uses the legacy cusparseMatDescr (for csric02); L uses the generic   */
  /* cusparseSpMat API (for SpSV). Both alias the same device arrays.       */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  /* Guard m == 0: cusparseXcsric02 errors out on empty matrices */
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  /* X/Y are the device work vectors wrapped by the dense-vector descriptors used in SpSV */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     The SpSV buffers for L and Lt may not alias each other, but the factor buffer may alias either;
     we alias it with the larger one. See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization (used for PetscLogGpuFlops in the numeric phase) */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft; /* NOTE: shadows the device `const int *Ai` above; this is A's host row pointer */
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2148da112707SJunchao Zhang #endif
2149da112707SJunchao Zhang 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself is performed on the host by
   the plain SeqAIJ kernel; afterwards, unless CPU solves were requested, GPU solve callbacks are
   installed and the triangular factors are analyzed/copied to the device. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field of Mat_SeqAIJCUSPARSE, which hangs off the non-factored matrix A;
  // the factored matrix B carries Mat_SeqAIJCUSPARSETriFactors instead, so read the flag from A.
  Mat_SeqAIJCUSPARSE *Acusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* Host factorization needs up-to-date host values */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!Acusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Older CUDA: pick the MatSolve variant matching the row/column orderings of the factorization */
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    PetscBool   rowIdentity, colIdentity;

    PetscCall(ISIdentity(bseq->row, &rowIdentity));
    PetscCall(ISIdentity(bseq->col, &colIdentity));
    if (rowIdentity && colIdentity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* Analyze and move the triangular factors onto the GPU for the device solve path */
  if (!Acusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2188d460d7bfSJunchao Zhang 
/* Symbolic LU factorization: discard any stale GPU factor data, delegate the symbolic work to the
   host SeqAIJ implementation, and route the numeric phase to the CUSPARSE-aware version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2199d460d7bfSJunchao Zhang 
/* Symbolic ILU factorization dispatcher: when CUDA >= 11.4, ILU(0) with identity orderings (and
   factorization not forced onto the host) is handled entirely on the GPU by the ILU0 fast path;
   every other case falls back to the host SeqAIJ symbolic phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool rowIdentity = PETSC_FALSE, colIdentity = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &rowIdentity));
    PetscCall(ISIdentity(iscol, &colIdentity));
  }
  if (!info->levels && rowIdentity && colIdentity) {
    /* ILU(0), natural ordering: use the all-GPU csrilu02/SpSV path */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2222da112707SJunchao Zhang 
/* Symbolic ICC factorization dispatcher: when CUDA >= 11.4, IC(0) with an identity permutation (and
   factorization not forced onto the host) goes through the all-GPU ICC0 fast path; otherwise fall
   back to the host SeqAIJ symbolic phase with the CUSPARSE numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &identity));
  if (!info->levels && identity) {
    /* IC(0), natural ordering: use the all-GPU csric02/SpSV path */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2242da112707SJunchao Zhang 
/* Symbolic Cholesky factorization for MATSEQAIJCUSPARSE.

   Resets any cached device triangular factors, delegates the symbolic phase to
   the host SeqAIJ implementation, and installs the CUSPARSE numeric-phase
   callback on B. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Stale device factors must be discarded before a new symbolic pass */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2253da112707SJunchao Zhang 
/* Report the solver-package name ("cusparse") for factor matrices of this type. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE; /* the Mat argument is unused */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2260841d4cb1SJunchao Zhang 
2261841d4cb1SJunchao Zhang /*MC
2262841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
226311a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
2264841d4cb1SJunchao Zhang   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2265841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
226611a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2267841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2268841d4cb1SJunchao Zhang 
2269841d4cb1SJunchao Zhang   Level: beginner
2270841d4cb1SJunchao Zhang 
22711cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22722ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2273841d4cb1SJunchao Zhang M*/
2274841d4cb1SJunchao Zhang 
/* Factory for CUSPARSE factor matrices of a MATSEQAIJCUSPARSE matrix A.

   Creates the square factor matrix B, wires up the symbolic-factorization
   function pointers for the requested factor type (LU/ILU/ILUDT or
   Cholesky/ICC), honoring whether A is bound to the CPU, and records the
   preferred orderings for each factor kind. Errors for unsupported factor
   types. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Propagate CPU binding from A to the factor when requested */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      /* device symbolic paths */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* host (SeqAIJ) symbolic paths when A is bound to the CPU */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* nested dissection for full LU; natural ordering for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* The factor matrix gets its storage during the symbolic phase, not here */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2315841d4cb1SJunchao Zhang 
/* Copy the matrix values from the GPU back to the host CSR array a->a.

   Only acts when the up-to-date copy lives exclusively on the GPU
   (offloadmask == PETSC_OFFLOAD_GPU). Unfactored matrices copy from the
   cuSPARSE CsrMatrix; for CUDA >= 11.4, factored matrices copy from the
   triangular-factor storage fs->csrVal when available. On success the
   offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* spptr doubles as the triangular-factor store for factored matrices */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23457e8381f9SStefano Zampini 
/* Return a read/write host pointer to the CSR values, syncing from the GPU first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Make the host copy current before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
235367a45760SJunchao Zhang 
/* Release a read/write host array; the host copy is now the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Values may have been modified on the host, so invalidate the device copy */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236167a45760SJunchao Zhang 
/* Return a read-only host pointer to the CSR values, syncing from the GPU first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236967a45760SJunchao Zhang 
/* Release a read-only host array; no offload-mask change since nothing was modified. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* the Mat argument is unused */
  PetscFunctionReturn(PETSC_SUCCESS);
}
237667a45760SJunchao Zhang 
/* Return a write-only host pointer to the CSR values.
   No device-to-host sync: the caller promises to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
238367a45760SJunchao Zhang 
/* Release a write-only host array; the host copy becomes authoritative. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host values were (re)written, so the device copy is stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
23917e8381f9SStefano Zampini 
/* Expose the device CSR arrays (row offsets i, column indices j, values a) and
   the memory type (CUDA) of an unfactored MATSEQAIJCUSPARSE matrix.

   Any of i/j/a/mtype may be NULL to skip that output. Row/column indices are
   stored as 32-bit ints on device (THRUSTINTARRAY32), so requesting them with
   64-bit PetscInt builds is an error. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Make sure the device copy is current before returning device pointers */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24227ee59b9bSJunchao Zhang 
/* Copy/rebuild the device (cuSPARSE) representation of a SeqAIJ matrix.

   Fast path: if the nonzero pattern is unchanged (nonzerostate matches) and the
   storage format is CSR, only the values array is re-uploaded. Otherwise the
   entire device structure is destroyed and rebuilt from the host CSR arrays,
   honoring compressed-row storage and the selected format (CSR, or ELL/HYB on
   CUDA < 11). When the host has no values yet (a->a == NULL), only the pattern
   is uploaded and the offload mask is left unchanged (both = PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set PETSC_FALSE when only the pattern (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed but pattern did not: the cached transpose values are stale */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* pattern (or format) changed: tear down the old device structures entirely */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload pattern only and keep offload mask as-is */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident alpha/beta scalars for SpMV with device pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary CSR, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        /* log the H2D traffic: row offsets + column indices (int) plus scalars */
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25719ae82921SPaul Mullowney 
/* Thrust functor: accumulate the first tuple element into the second (b += a). */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2579aa372e3fSPaul Mullowney 
/* Thrust functor: copy the first tuple element into the second (b = a). */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    auto const &src   = thrust::get<0>(t);
    thrust::get<1>(t) = src;
  }
};
25877e8381f9SStefano Zampini 
/* Thrust functor: copy the second tuple element into the first (a = b). */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    auto const &src   = thrust::get<1>(t);
    thrust::get<0>(t) = src;
  }
};
2595e6e9a74fSStefano Zampini 
/* Per-product context attached to C->product->data for sparse-times-dense and
   sparse-times-sparse products involving MATSEQAIJCUSPARSE matrices.
   Freed by MatProductCtxDestroy_MatMatCusparse(). */
struct MatProductCtx_MatMatCusparse {
  PetscBool      cisdense; /* whether the product matrix C is dense */
  PetscScalar   *Bt;       /* device buffer (cudaMalloc'ed; freed with cudaFree) */
  Mat            X;        /* auxiliary matrix owned by this context */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count recorded for the product */
  CsrMatrix     *Bcsr;     /* CSR copy of B owned by this context */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers used by the CUDA >= 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* SpMM/SpGEMM compute buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor driving cusparseSpGEMM_* calls */
#endif
};
2620ccdfe979SStefano Zampini 
/* Destroy a MatProductCtx_MatMatCusparse context: free all device buffers,
   cuSPARSE descriptors, the auxiliary matrix X, and finally the context itself.
   Each descriptor/buffer is guarded by a NULL check since only some product
   paths allocate them. */
static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
{
  MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(mmdata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2644ccdfe979SStefano Zampini 
26454742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2646ccdfe979SStefano Zampini 
MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)2647d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2648d71ae5a4SJacob Faibussowitsch {
2649ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2650ccdfe979SStefano Zampini   Mat                           A, B;
2651afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2652ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2653ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2654ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2655ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2656ccdfe979SStefano Zampini   const PetscScalar            *barray;
2657ccdfe979SStefano Zampini   PetscScalar                  *carray;
2658cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2659ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2660ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2661ccdfe979SStefano Zampini 
2662ccdfe979SStefano Zampini   PetscFunctionBegin;
2663ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
266428b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2665cc1eb50dSBarry Smith   mmdata = (MatProductCtx_MatMatCusparse *)product->data;
2666ccdfe979SStefano Zampini   A      = product->A;
2667ccdfe979SStefano Zampini   B      = product->B;
26689566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
266928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2670ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2671ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
267228b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2674ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2675ccdfe979SStefano Zampini   switch (product->type) {
2676ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2677ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2678ccdfe979SStefano Zampini     mat = cusp->mat;
2679ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2680ccdfe979SStefano Zampini     m   = A->rmap->n;
2681ccdfe979SStefano Zampini     n   = B->cmap->n;
2682ccdfe979SStefano Zampini     break;
2683ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
26841a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2685e6e9a74fSStefano Zampini       mat = cusp->mat;
2686e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2687e6e9a74fSStefano Zampini     } else {
26889566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2689ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2690ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2691e6e9a74fSStefano Zampini     }
2692ccdfe979SStefano Zampini     m = A->cmap->n;
2693ccdfe979SStefano Zampini     n = B->cmap->n;
2694ccdfe979SStefano Zampini     break;
2695ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2696ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2697ccdfe979SStefano Zampini     mat = cusp->mat;
2698ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2699ccdfe979SStefano Zampini     m   = A->rmap->n;
2700ccdfe979SStefano Zampini     n   = B->rmap->n;
2701ccdfe979SStefano Zampini     break;
2702d71ae5a4SJacob Faibussowitsch   default:
2703d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2704ccdfe979SStefano Zampini   }
270528b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2706ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2707ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27089566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27099566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2710cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2711afb2bd1cSJunchao Zhang 
27129566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2713c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2714cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27159566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2716c8378d12SStefano Zampini   } else {
2717cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27189566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2719c8378d12SStefano Zampini   }
2720c8378d12SStefano Zampini 
27219566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2722afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2723afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2724fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2725fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2726fe5544b9SJunchao Zhang   #else
2727fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2728fe5544b9SJunchao Zhang   #endif
2729fe5544b9SJunchao Zhang 
2730a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2731afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2732fcdce8c4SStefano Zampini     size_t mmBufferSize;
27339371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27349371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27359371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27369371c9d4SSatish Balay     }
2737afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27389566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2739afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2740afb2bd1cSJunchao Zhang     }
2741c8378d12SStefano Zampini 
27429371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27439371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27449371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27459371c9d4SSatish Balay     }
2746afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27479566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2748afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2749afb2bd1cSJunchao Zhang     }
2750afb2bd1cSJunchao Zhang 
2751fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2752fe5544b9SJunchao Zhang     if (matADescr) {
275317f5f06fSJunchao Zhang       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2754fe5544b9SJunchao Zhang       matADescr = NULL;
2755fe5544b9SJunchao Zhang     }
2756fe5544b9SJunchao Zhang   #endif
2757fe5544b9SJunchao Zhang 
2758fe5544b9SJunchao Zhang     if (!matADescr) {
2759fe5544b9SJunchao Zhang       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27609371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27619371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2762afb2bd1cSJunchao Zhang     }
2763fe5544b9SJunchao Zhang 
2764fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2765fe5544b9SJunchao Zhang 
2766fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27679566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27689566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2769fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2770fcdce8c4SStefano Zampini     }
2771fe5544b9SJunchao Zhang 
2772f0b74427SPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2773fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2774fe5544b9SJunchao Zhang   #endif
2775fe5544b9SJunchao Zhang 
2776afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2777afb2bd1cSJunchao Zhang   } else {
2778afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2779fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
27809566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
27819566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2782afb2bd1cSJunchao Zhang   }
2783afb2bd1cSJunchao Zhang 
2784afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2785fe5544b9SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2786afb2bd1cSJunchao Zhang #else
2787afb2bd1cSJunchao Zhang   PetscInt k;
2788afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2789ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2790ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2791ccdfe979SStefano Zampini     cublasStatus_t cerr;
2792ccdfe979SStefano Zampini 
27939566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
27949371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
27959371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2796ccdfe979SStefano Zampini     blda = B->cmap->n;
2797afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2798afb2bd1cSJunchao Zhang   } else {
2799afb2bd1cSJunchao Zhang     k = B->rmap->n;
2800ccdfe979SStefano Zampini   }
2801ccdfe979SStefano Zampini 
2802afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
28039371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
28049371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2805afb2bd1cSJunchao Zhang #endif
28069566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
28079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2808cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2809ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2810cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28114742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2812ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2813cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28144742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2815ccdfe979SStefano Zampini   } else {
2816cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2817ccdfe979SStefano Zampini   }
281848a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
281948a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
28203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2821ccdfe979SStefano Zampini }
2822ccdfe979SStefano Zampini 
MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)2823d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2824d71ae5a4SJacob Faibussowitsch {
2825ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2826ccdfe979SStefano Zampini   Mat                           A, B;
2827ccdfe979SStefano Zampini   PetscInt                      m, n;
2828ccdfe979SStefano Zampini   PetscBool                     cisdense, flg;
2829cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2830ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2831ccdfe979SStefano Zampini 
2832ccdfe979SStefano Zampini   PetscFunctionBegin;
2833ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
283428b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2835ccdfe979SStefano Zampini   A = product->A;
2836ccdfe979SStefano Zampini   B = product->B;
28379566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
283828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2839ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
284008401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2841ccdfe979SStefano Zampini   switch (product->type) {
2842ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2843ccdfe979SStefano Zampini     m = A->rmap->n;
2844ccdfe979SStefano Zampini     n = B->cmap->n;
28450e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2846ccdfe979SStefano Zampini     break;
2847ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2848ccdfe979SStefano Zampini     m = A->cmap->n;
2849ccdfe979SStefano Zampini     n = B->cmap->n;
28500e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
28510e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2852ccdfe979SStefano Zampini     break;
2853ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2854ccdfe979SStefano Zampini     m = A->rmap->n;
2855ccdfe979SStefano Zampini     n = B->rmap->n;
28560e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
28570e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2858ccdfe979SStefano Zampini     break;
2859ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2860ccdfe979SStefano Zampini     m = B->cmap->n;
2861ccdfe979SStefano Zampini     n = B->cmap->n;
28620e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
28630e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2864ccdfe979SStefano Zampini     break;
2865ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2866ccdfe979SStefano Zampini     m = B->rmap->n;
2867ccdfe979SStefano Zampini     n = B->rmap->n;
28680e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
28690e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2870ccdfe979SStefano Zampini     break;
2871d71ae5a4SJacob Faibussowitsch   default:
2872d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2873ccdfe979SStefano Zampini   }
28749566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2875ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
28769566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
28779566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2878ccdfe979SStefano Zampini 
2879ccdfe979SStefano Zampini   /* product data */
28809566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2881ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2882afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2883afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
288448a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2885afb2bd1cSJunchao Zhang #endif
2886ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2887ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
28889566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
28899566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2890ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
28919566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2892ccdfe979SStefano Zampini     } else {
28939566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2894ccdfe979SStefano Zampini     }
2895ccdfe979SStefano Zampini   }
2896ccdfe979SStefano Zampini   C->product->data    = mmdata;
2897cc1eb50dSBarry Smith   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2898ccdfe979SStefano Zampini 
2899ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
29003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2901ccdfe979SStefano Zampini }
2902ccdfe979SStefano Zampini 
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)2903d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2904d71ae5a4SJacob Faibussowitsch {
2905ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2906fcdce8c4SStefano Zampini   Mat                           A, B;
2907fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2908fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2909fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2910fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2911fcdce8c4SStefano Zampini   PetscBool                     flg;
2912fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2913fcdce8c4SStefano Zampini   MatProductType                ptype;
2914cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2915fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2916fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2917fcdce8c4SStefano Zampini #endif
2918b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2919ccdfe979SStefano Zampini 
2920ccdfe979SStefano Zampini   PetscFunctionBegin;
2921ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
292228b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
29239566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
292428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2925cc1eb50dSBarry Smith   mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
2926fcdce8c4SStefano Zampini   A      = product->A;
2927fcdce8c4SStefano Zampini   B      = product->B;
2928fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2929fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2930fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
293108401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2932fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
293328b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2934fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
293528b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2936fcdce8c4SStefano Zampini     goto finalize;
2937fcdce8c4SStefano Zampini   }
2938fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29399566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
294028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29419566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
294228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
294328b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
294428b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2945fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2946fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2947fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
294808401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
294908401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
295008401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
29519566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29529566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2953fcdce8c4SStefano Zampini 
2954fcdce8c4SStefano Zampini   ptype = product->type;
2955b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2956fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
295728b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2958fa046f9fSJunchao Zhang   }
2959b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2960fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
296128b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2962fa046f9fSJunchao Zhang   }
2963fcdce8c4SStefano Zampini   switch (ptype) {
2964fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2965fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2966fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2967fcdce8c4SStefano Zampini     break;
2968fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2969fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2970fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2971fcdce8c4SStefano Zampini     break;
2972fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2973fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2974fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2975fcdce8c4SStefano Zampini     break;
2976d71ae5a4SJacob Faibussowitsch   default:
2977d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2978fcdce8c4SStefano Zampini   }
2979fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
298028b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
298128b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
298228b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2983fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2984fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2985fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
298628b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
298728b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
298828b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
29899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2990fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2991fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
29929566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2993b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
29949371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29959371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2996b4285af6SJunchao Zhang   #else
29979371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29989371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29999371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3001b4285af6SJunchao Zhang   #endif
3002fcdce8c4SStefano Zampini #else
30039371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30049371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30059371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3006fcdce8c4SStefano Zampini #endif
30079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30089566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
30099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3010fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
3011fcdce8c4SStefano Zampini finalize:
3012fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
30139566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
30149566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
30159566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3016fcdce8c4SStefano Zampini   c->reallocs = 0;
3017fcdce8c4SStefano Zampini   C->info.mallocs += 0;
3018fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
3019fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
3020fcdce8c4SStefano Zampini   C->num_ass++;
30213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3022ccdfe979SStefano Zampini }
3023fcdce8c4SStefano Zampini 
MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)3024d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3025d71ae5a4SJacob Faibussowitsch {
3026fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3027fcdce8c4SStefano Zampini   Mat                           A, B;
3028fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3029fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3030fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3031fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3032fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3033fcdce8c4SStefano Zampini   PetscBool                     flg;
3034fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3035fcdce8c4SStefano Zampini   MatProductType                ptype;
3036cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
3037fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3038fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3039fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3040fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3041fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3042fcdce8c4SStefano Zampini #else
3043fcdce8c4SStefano Zampini   int cnz;
3044fcdce8c4SStefano Zampini #endif
3045b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3046fcdce8c4SStefano Zampini 
3047fcdce8c4SStefano Zampini   PetscFunctionBegin;
3048fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
304928b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3050fcdce8c4SStefano Zampini   A = product->A;
3051fcdce8c4SStefano Zampini   B = product->B;
30529566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
305328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30549566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
305528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3056fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3057fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3058fcdce8c4SStefano Zampini   /* product data */
30599566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3060fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3061cc1eb50dSBarry Smith   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3062fcdce8c4SStefano Zampini 
30639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3065d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3066d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
306708401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
306808401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3069d60bce21SJunchao Zhang 
3070fcdce8c4SStefano Zampini   ptype = product->type;
3071b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3072fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3073fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3074fa046f9fSJunchao Zhang   }
3075b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3076fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3077fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3078fa046f9fSJunchao Zhang   }
3079fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3080fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3081fcdce8c4SStefano Zampini   switch (ptype) {
3082fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3083fcdce8c4SStefano Zampini     m    = A->rmap->n;
3084fcdce8c4SStefano Zampini     n    = B->cmap->n;
3085fcdce8c4SStefano Zampini     k    = A->cmap->n;
3086fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3087fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3088fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3089fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3090fcdce8c4SStefano Zampini     break;
3091fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3092fcdce8c4SStefano Zampini     m = A->cmap->n;
3093fcdce8c4SStefano Zampini     n = B->cmap->n;
3094fcdce8c4SStefano Zampini     k = A->rmap->n;
30959566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3096fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3097fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3098fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3099fcdce8c4SStefano Zampini     break;
3100fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3101fcdce8c4SStefano Zampini     m = A->rmap->n;
3102fcdce8c4SStefano Zampini     n = B->rmap->n;
3103fcdce8c4SStefano Zampini     k = A->cmap->n;
31049566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3105fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3106fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3107fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3108fcdce8c4SStefano Zampini     break;
3109d71ae5a4SJacob Faibussowitsch   default:
3110d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3111fcdce8c4SStefano Zampini   }
3112fcdce8c4SStefano Zampini 
3113fcdce8c4SStefano Zampini   /* create cusparse matrix */
31149566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
31159566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3116fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3117fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3118fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3119fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3120fcdce8c4SStefano Zampini 
3121fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
3123fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
31249566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
31259566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3126fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3127fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3128fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3129fcdce8c4SStefano Zampini   } else {
3130fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3131fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3132fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3133fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3134fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3135fcdce8c4SStefano Zampini   }
3136fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3137fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3138fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3139fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3140fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3141fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31429566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3145f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3146f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3147f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31489566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31499566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31509566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3151fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3152d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3153fcdce8c4SStefano Zampini     c->nz                = 0;
3154fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3155fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3156fcdce8c4SStefano Zampini     goto finalizesym;
3157fcdce8c4SStefano Zampini   }
3158fcdce8c4SStefano Zampini 
315928b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
316028b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3161fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3162fcdce8c4SStefano Zampini   if (!biscompressed) {
3163fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3164fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3165fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3166fcdce8c4SStefano Zampini #endif
3167fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3168fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3169fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3170fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3171fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3172fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3173fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3174fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3175fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3176fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3177fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31789566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3179fcdce8c4SStefano Zampini     }
3180fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3181fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3182fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3183fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
31849371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31859371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3186fcdce8c4SStefano Zampini     }
3187fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3188fcdce8c4SStefano Zampini #endif
3189fcdce8c4SStefano Zampini   }
319028b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
319128b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3192fcdce8c4SStefano Zampini   /* precompute flops count */
3193fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3194fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3195fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3196fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3197fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3198fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3199fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3200fcdce8c4SStefano Zampini       }
3201fcdce8c4SStefano Zampini     }
3202fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3203fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3204fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3205fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3206fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3207fcdce8c4SStefano Zampini     }
3208fcdce8c4SStefano Zampini   } else { /* TODO */
3209fcdce8c4SStefano Zampini     flops = 0.;
3210fcdce8c4SStefano Zampini   }
3211fcdce8c4SStefano Zampini 
3212fcdce8c4SStefano Zampini   mmdata->flops = flops;
32139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3214b4285af6SJunchao Zhang 
3215fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
32169566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
32171ffab3bdSJunchao Zhang   // cuda-12.2 requires non-null csrRowOffsets
32181ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32199371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32209566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3221b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3222b4285af6SJunchao Zhang   {
3223b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3224b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3225b4285af6SJunchao Zhang   */
3226b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3227b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3228b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3229b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3230b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3231b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3232b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3233b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3234b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3235b4285af6SJunchao Zhang 
3236b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32379371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32389371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32399566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3240b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32419371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32429371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3243b4285af6SJunchao Zhang 
32449371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32499371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32509371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3253b4285af6SJunchao Zhang 
3254b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32559566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3256b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3257b4285af6SJunchao Zhang     /* allocate matrix C */
32589371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32599371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32609371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32619371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3262b4285af6SJunchao Zhang     /* update matC with the new pointers */
32639371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32649371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3265b4285af6SJunchao Zhang 
32669371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32679371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32689566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32699371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32709371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32729371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32739371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32749566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3275b4285af6SJunchao Zhang   }
3276ae37ee31SJunchao Zhang   #else
3277b4285af6SJunchao Zhang   size_t bufSize2;
3278fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
32799371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
32809371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32819566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3282fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
32839371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
32849371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3285fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
32869371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
32879371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3288fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3289fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3290fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3291fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3292fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
32939566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3294fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
32959371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
32969371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3297fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
32989566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3299fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
33009371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
33019371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3302fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33039566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3304fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33059566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
33069371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
33079371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33089371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
33099371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3310ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3311fcdce8c4SStefano Zampini #else
33129566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
33139371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33149371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
33159371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3316fcdce8c4SStefano Zampini   c->nz                = cnz;
3317fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33209566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3321fcdce8c4SStefano Zampini 
33229566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3323fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3324fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3325fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
33269371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33279371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
33289371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3329fcdce8c4SStefano Zampini #endif
33309566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3332fcdce8c4SStefano Zampini finalizesym:
3333fcdce8c4SStefano Zampini   c->free_a = PETSC_TRUE;
33349f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
33359f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3336fcdce8c4SStefano Zampini   c->free_ij = PETSC_TRUE;
33377de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3338fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3339fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3340fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3341fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3342fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3343fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346fcdce8c4SStefano Zampini   } else {
3347fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3348fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33499566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3351fcdce8c4SStefano Zampini   }
3352fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3353fcdce8c4SStefano Zampini     PetscInt r = 0;
3354fcdce8c4SStefano Zampini     c->i[0]    = 0;
3355fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3356fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3357fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3358fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3359fcdce8c4SStefano Zampini     }
3360fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3361fcdce8c4SStefano Zampini   }
33629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3365fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3366fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3367fcdce8c4SStefano Zampini   c->rmax          = 0;
3368fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3369fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3370fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3371fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3372fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3373fcdce8c4SStefano Zampini   }
33749566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3375fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3376fcdce8c4SStefano Zampini 
3377fcdce8c4SStefano Zampini   C->nonzerostate++;
33789566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
33799566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3380fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3381fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3382fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3383fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3384fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3385abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3386fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3387fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3388fcdce8c4SStefano Zampini   }
3389fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
33903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3391fcdce8c4SStefano Zampini }
3392fcdce8c4SStefano Zampini 
3393fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3394fcdce8c4SStefano Zampini 
/* handles B being either sparse (MATSEQAIJCUSPARSE) or dense (MATSEQDENSE) */
MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)3396d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3397d71ae5a4SJacob Faibussowitsch {
3398fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3399fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3400fcdce8c4SStefano Zampini 
3401fcdce8c4SStefano Zampini   PetscFunctionBegin;
3402fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
34039566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
340448a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3405fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3406fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
340748a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3408fcdce8c4SStefano Zampini   }
340965e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
341065e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
341165e4b4d4SStefano Zampini     switch (product->type) {
341265e4b4d4SStefano Zampini     case MATPRODUCT_AB:
341365e4b4d4SStefano Zampini       if (product->api_user) {
3414d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
34159566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3416d0609cedSBarry Smith         PetscOptionsEnd();
341765e4b4d4SStefano Zampini       } else {
3418d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
34199566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3420d0609cedSBarry Smith         PetscOptionsEnd();
342165e4b4d4SStefano Zampini       }
342265e4b4d4SStefano Zampini       break;
342365e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
342465e4b4d4SStefano Zampini       if (product->api_user) {
3425d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
34269566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3427d0609cedSBarry Smith         PetscOptionsEnd();
342865e4b4d4SStefano Zampini       } else {
3429d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
34309566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3431d0609cedSBarry Smith         PetscOptionsEnd();
343265e4b4d4SStefano Zampini       }
343365e4b4d4SStefano Zampini       break;
343465e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
343565e4b4d4SStefano Zampini       if (product->api_user) {
3436d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
34379566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3438d0609cedSBarry Smith         PetscOptionsEnd();
343965e4b4d4SStefano Zampini       } else {
3440d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
34419566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3442d0609cedSBarry Smith         PetscOptionsEnd();
344365e4b4d4SStefano Zampini       }
344465e4b4d4SStefano Zampini       break;
344565e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
344665e4b4d4SStefano Zampini       if (product->api_user) {
3447d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
34489566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3449d0609cedSBarry Smith         PetscOptionsEnd();
345065e4b4d4SStefano Zampini       } else {
3451d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
34529566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3453d0609cedSBarry Smith         PetscOptionsEnd();
345465e4b4d4SStefano Zampini       }
345565e4b4d4SStefano Zampini       break;
345665e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
345765e4b4d4SStefano Zampini       if (product->api_user) {
3458d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
34599566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3460d0609cedSBarry Smith         PetscOptionsEnd();
346165e4b4d4SStefano Zampini       } else {
3462d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
34639566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3464d0609cedSBarry Smith         PetscOptionsEnd();
346565e4b4d4SStefano Zampini       }
346665e4b4d4SStefano Zampini       break;
3467d71ae5a4SJacob Faibussowitsch     default:
3468d71ae5a4SJacob Faibussowitsch       break;
346965e4b4d4SStefano Zampini     }
347065e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
347165e4b4d4SStefano Zampini   }
347265e4b4d4SStefano Zampini   /* dispatch */
3473fcdce8c4SStefano Zampini   if (isdense) {
3474ccdfe979SStefano Zampini     switch (product->type) {
3475ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3476ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3477ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3478ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3479ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3480fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
34819566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3482fcdce8c4SStefano Zampini       } else {
3483fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3484fcdce8c4SStefano Zampini       }
3485fcdce8c4SStefano Zampini       break;
3486d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3487d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3488d71ae5a4SJacob Faibussowitsch       break;
3489d71ae5a4SJacob Faibussowitsch     default:
3490d71ae5a4SJacob Faibussowitsch       break;
3491ccdfe979SStefano Zampini     }
3492fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3493fcdce8c4SStefano Zampini     switch (product->type) {
3494fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3495fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3496d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3497d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3498d71ae5a4SJacob Faibussowitsch       break;
3499fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3500fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3501d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3502d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3503d71ae5a4SJacob Faibussowitsch       break;
3504d71ae5a4SJacob Faibussowitsch     default:
3505d71ae5a4SJacob Faibussowitsch       break;
3506fcdce8c4SStefano Zampini     }
3507fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
35089566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3509fcdce8c4SStefano Zampini   }
35103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3511ccdfe979SStefano Zampini }
3512ccdfe979SStefano Zampini 
/* yy = A xx: plain matrix-vector product (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* yy==NULL means "no add"; trans=FALSE, herm=FALSE selects op(A)=A */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3519e6e9a74fSStefano Zampini 
/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* trans=FALSE, herm=FALSE selects op(A)=A; yy supplies the additive term */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3526e6e9a74fSStefano Zampini 
/* yy = A^H xx: conjugate-transpose product */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* trans=TRUE, herm=TRUE selects op(A)=A^H; yy==NULL means "no add" */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3533e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* trans=TRUE, herm=TRUE selects op(A)=A^H; yy supplies the additive term */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
35409ae82921SPaul Mullowney 
/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* trans=TRUE, herm=FALSE selects op(A)=A^T; yy==NULL means "no add" */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3547ca45077fSPaul Mullowney 
/* y[idx[i]] += x[i] for i in [0, n): scatter-add the compressed work vector x into the full vector y.
   One thread per entry; launched with ceil(n/256) blocks of 256 threads on PetscDefaultCudaStream.
   Note: the index is computed in PetscInt so the kernel stays correct for 64-bit-PetscInt builds where
   n may exceed INT_MAX (the old `int i` could overflow). idx is read-only, hence const. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i = blockIdx.x * (PetscInt)blockDim.x + threadIdx.x; /* widen before multiply to avoid 32-bit overflow */
  if (i < n) y[idx[i]] += x[i];
}
3553a0e72f99SJunchao Zhang 
3554afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x, read from the CSR struct below */
#endif

  PetscFunctionBegin;
  /* the Hermitian product is only implemented through the transpose path */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A) x is zero, so zz = yy (or zero when there is no yy) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate-)transposed operation on the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose (built on demand) with a non-transpose op */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: workVector[i] = x[cprowIndices[i]] */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) { /* create the per-op CSR descriptor on first use */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop count: 2 per stored nonzero; without the add there is one fewer op per nonempty row */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
37279ae82921SPaul Mullowney 
/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* trans=TRUE, herm=FALSE selects op(A)=A^T; yy supplies the additive term */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3734ca45077fSPaul Mullowney 
37359ee18893SBarry Smith PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
37369ee18893SBarry Smith 
/* One thread per row: pull the diagonal entry of each row of a CSR matrix into diag.
   row/col/val are the CSR row offsets, column indices and values; len is the number of rows.
   A row with no stored diagonal entry yields 0.0. */
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r < len) {
    const PetscInt begin = row[r], end = row[r + 1]; /* bounds of row r in col[]/val[] */
    PetscScalar    dval  = 0.0;

    for (PetscInt j = begin; j < end; j++) {
      if (col[j] == r) { /* found the diagonal position */
        dval = val[j];
        break;
      }
    }
    diag[r] = dval;
  }
}
37549ee18893SBarry Smith 
/* Extract the diagonal of A into diag, doing the work on the GPU whenever the
   up-to-date values live there; otherwise defer to the host implementation. */
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE           *cusp    = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat;
  PetscScalar                  *darray;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_BOTH && A->offloadmask != PETSC_OFFLOAD_GPU) {
    /* current values are only on the host: use the CPU path */
    PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  {
    const PetscInt n   = A->rmap->n;
    CsrMatrix     *csr = (CsrMatrix *)mstruct->mat;

    PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
    if (n > 0) {
      PetscCall(VecCUDAGetArrayWrite(diag, &darray));
      /* one thread per row, 256 threads per block */
      GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), n, darray);
      PetscCallCUDA(cudaPeekAtLastError());
      PetscCall(VecCUDARestoreArrayWrite(diag, &darray));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
37769ee18893SBarry Smith 
/* Finish assembly by running the host SeqAIJ assembly; GPU copies are refreshed lazily on first use */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
37839ae82921SPaul Mullowney 
3784e057df02SPaul Mullowney /*@
378553220ed8SBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
37869ae82921SPaul Mullowney 
3787d083f849SBarry Smith   Collective
37889ae82921SPaul Mullowney 
37899ae82921SPaul Mullowney   Input Parameters:
379011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
37919ae82921SPaul Mullowney . m    - number of rows
37929ae82921SPaul Mullowney . n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
379420f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37959ae82921SPaul Mullowney 
37969ae82921SPaul Mullowney   Output Parameter:
37979ae82921SPaul Mullowney . A - the matrix
37989ae82921SPaul Mullowney 
37992ef1f0ffSBarry Smith   Level: intermediate
38002ef1f0ffSBarry Smith 
38012ef1f0ffSBarry Smith   Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
38032920cce0SJacob Faibussowitsch   calculations. For good matrix assembly performance the user should preallocate the matrix
38042920cce0SJacob Faibussowitsch   storage by setting the parameter `nz` (or the array `nnz`).
38052920cce0SJacob Faibussowitsch 
380611a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
380811a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
38099ae82921SPaul Mullowney 
381011a5261eSBarry Smith   The AIJ format, also called
38112ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
38129ae82921SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
381320f4b53cSBarry Smith   either one (as in Fortran) or zero.
38149ae82921SPaul Mullowney 
38159ae82921SPaul Mullowney   Specify the preallocated storage with either nz or nnz (not both).
38162ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
381720f4b53cSBarry Smith   allocation.
38189ae82921SPaul Mullowney 
381953220ed8SBarry Smith   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
382053220ed8SBarry Smith 
382153220ed8SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
382253220ed8SBarry Smith           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
38239ae82921SPaul Mullowney @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* cast drops const to match the preallocation routine's signature -- presumably nnz is not modified; verify against callee */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38339ae82921SPaul Mullowney 
/* Release GPU-side data and composed methods, then destroy the host AIJ part */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* assembled matrix: free the SpMV/product structures hanging off spptr */
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    /* factored matrix: free the triangular-factor structures instead */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* detach every method composed onto this object at creation/conversion time */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A)); /* base-class destroy runs last */
  PetscFunctionReturn(PETSC_SUCCESS);
}
38559ae82921SPaul Mullowney 
3856ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
385795639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* Duplicate on the host first, then convert the copy in place to the CUSPARSE type */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38659ff858a8SKarl Rupp 
/* Y = Y + a*X. Picks the cheapest device path based on the relation of the two sparsity patterns:
   identical patterns -> one cublas axpy on the value arrays; X's pattern a subset of Y's -> cusparse spgeam;
   anything else (or X not on the GPU) -> fall back to the host kernel. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cuspY, *cuspX;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csrY, *csrX;

  PetscFunctionBegin;
  cuspY = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cuspX = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* X does not use the CUSPARSE axpy (e.g. bound to CPU): do the sum on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cuspY->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cuspX->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csrY = (CsrMatrix *)cuspY->mat->mat;
  csrX = (CsrMatrix *)cuspX->mat->mat;
  /* upgrade to SAME_NONZERO_PATTERN (so we can use a plain cublas axpy) when the patterns actually coincide */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csrY->row_offsets->begin(), csrY->row_offsets->end(), csrX->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csrY->column_indices->begin(), csrY->column_indices->end(), csrX->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column, so force the host fallback in that case */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* alpha/beta live on the host here, so switch pointer mode for the duration of the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit workspace query before the geam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cuspY->handle, Y->rmap->n, Y->cmap->n, &a, cuspX->mat->descr, x->nz, ax, csrX->row_offsets->data().get(), csrX->column_indices->data().get(), &b, cuspY->mat->descr, y->nz, ay, csrY->row_offsets->data().get(),
                                                     csrY->column_indices->data().get(), cuspY->mat->descr, ay, csrY->row_offsets->data().get(), csrY->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    /* result is written in place over Y's values since Y's pattern is a superset of X's */
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspY->handle, Y->rmap->n, Y->cmap->n, &a, cuspX->mat->descr, x->nz, ax, csrX->row_offsets->data().get(), csrX->column_indices->data().get(), &b, cuspY->mat->descr, y->nz, ay, csrY->row_offsets->data().get(),
                                          csrY->column_indices->data().get(), cuspY->mat->descr, ay, csrY->row_offsets->data().get(), csrY->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspY->handle, Y->rmap->n, Y->cmap->n, &a, cuspX->mat->descr, x->nz, ax, csrX->row_offsets->data().get(), csrX->column_indices->data().get(), &b, cuspY->mat->descr, y->nz, ay, csrY->row_offsets->data().get(),
                                          csrY->column_indices->data().get(), cuspY->mat->descr, ay, csrY->row_offsets->data().get(), csrY->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else {
    /* general case: compute on the host and drop the now-stale cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
394995639643SRichard Tran Mills 
/* Y = a*Y, performed directly on the device value array with a cublas scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *val;
  cublasHandle_t handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &val));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, bnz, &a, val, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &val));
  PetscFunctionReturn(PETSC_SUCCESS);
}
396833c9ba73SStefano Zampini 
/* Zero all stored values. Prefer zeroing directly on the device when the CSR values
   already exist there; otherwise zero the host array and mark the data as CPU-resident. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   zeroedOnGPU = PETSC_FALSE;
  Mat_SeqAIJ *a           = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
      if (csr->values) {
        zeroedOnGPU = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached transpose consistent with the zeroed matrix */
      CsrMatrix *csrT = (CsrMatrix *)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  if (zeroedOnGPU) A->offloadmask = PETSC_OFFLOAD_GPU;
  else {
    PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
39963fa6b06aSMark Adams 
/* Report where the matrix data lives; for this type that is always CUDA device memory */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
400303db1824SAlex Lindsay 
/* Swap the operation tables between host (flg == PETSC_TRUE) and device implementations,
   and (un)register the composed methods that only make sense on the device side. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices just record the flag; they have no alternative op tables here */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* binding to the CPU: bring the values down, then install the host kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* wipe the SeqAIJ-internal op table so the defaults take over */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* binding to the GPU: install the device kernels and value-array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are only exploited by the host kernels */
  a->inode.use = (flg && a->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4071a587d139SMark 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: create/copy the destination as requested,
   allocate the cuSPARSE bookkeeping in spptr (unless reusing), and install the device
   op tables and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  B = *newmat; /* for MAT_INPLACE_MATRIX this is A itself */

  /* vectors created from this matrix should live on the GPU as well */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices only need a handle/stream pair */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* start life bound to the GPU; this installs the remaining device ops */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
41329ae82921SPaul Mullowney 
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* Build a plain SeqAIJ and convert it in place to the CUSPARSE flavor */
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
414002fe1965SBarry Smith 
41413ca39a21SBarry Smith /*MC
414253220ed8SBarry Smith    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4143e057df02SPaul Mullowney 
4144e057df02SPaul Mullowney    Options Database Keys:
414553220ed8SBarry Smith +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
414653220ed8SBarry Smith .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
41472ef1f0ffSBarry Smith                                            Other options include ell (ellpack) or hyb (hybrid).
414853220ed8SBarry Smith .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
414953220ed8SBarry Smith -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4150e057df02SPaul Mullowney 
4151e057df02SPaul Mullowney   Level: beginner
4152e057df02SPaul Mullowney 
415353220ed8SBarry Smith   Notes:
415453220ed8SBarry Smith   These matrices can be in either CSR, ELL, or HYB format.
415553220ed8SBarry Smith 
415653220ed8SBarry Smith   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
415753220ed8SBarry Smith 
415853220ed8SBarry Smith   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
415953220ed8SBarry Smith   if some integer values passed in do not fit in `int`.
416053220ed8SBarry Smith 
41611cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4162e057df02SPaul Mullowney M*/
41637f756511SDominic Meiser 
/* Register the cuSPARSE solver package for every factorization kind it supports */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t i = 0; i < sizeof(factortypes) / sizeof(factortypes[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factortypes[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
417329b38603SBarry Smith 
/* Release everything hanging off mat->spptr for an unfactored CUSPARSE matrix */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* thrust vectors tolerate delete on NULL, so no individual guards needed */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
41917f756511SDominic Meiser 
/* Free a CsrMatrix (values, column indices, row offsets) and NULL the caller's pointer.
   Safe to call when *mat is already NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    /* delete on a NULL member is a no-op, so no per-member guards are needed */
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL; /* NULL, not 0, for consistency with the other destroy routines in this file */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
42047f756511SDominic Meiser 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free a triangular-factor struct: descriptor, solve info, CSR data, and scratch buffers.
   Only needed for CUDA < 11.4, where the legacy csrsv solve path is used. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host staging array */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor)); /* also NULLs *trifactor */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
42237f756511SDominic Meiser 
/* Free a mult struct: the stored matrix (CSR or legacy HYB/ELL), its descriptor,
   the device-resident scalar constants, and any generic-API (CUDA >= 11) descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        CsrMatrix *csr = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&csr));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants (1, 0) kept in device memory for CUSPARSE_POINTER_MODE_DEVICE calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached SpMV setup per operation (N, T, H) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
42707f756511SDominic Meiser 
/* Release all device and host resources referenced by the triangular-factor
   container while keeping the container itself (and its cusparse handle) alive,
   so a subsequent factorization can re-populate it.  A NULL *trifactors is a
   no-op.  Compare MatSeqAIJCUSPARSETriFactors_Destroy(), which also frees the
   container. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* Legacy path: factors stored as separate mult structs plus a work vector */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices; /* row/column permutation indices; deleting NULL is harmless */
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* Generic-API (SpSV) path: device CSR arrays, solve buffers and descriptors.
       cudaFree(NULL) is a documented no-op, so the frees need no guards. */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h)); /* host-side mirrors */
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4324ccdfe979SStefano Zampini 
/* Fully destroy a triangular-factor container: reset its contents, destroy the
   cusparse handle it owns, then free the container itself.  A NULL target is a
   no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
43357e8381f9SStefano Zampini 
/* Strict weak ordering for (row, col) index pairs: row-major lexicographic. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;          /* rows differ: order by row */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* tie-break on column */
  }
};
43447e8381f9SStefano Zampini 
/* Mark the cached device transpose of A as out of date; when destroy is true,
   also free the cached transpose struct and the csr2csc permutation.  If A has
   no GPU data (spptr is NULL) nothing is done. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4360a49f1ed0SStefano Zampini 
/* Container destructor for the device-side COO struct attached to the matrix
   (see MatSetPreallocationCOO_SeqAIJCUSPARSE): frees the device jmap/perm
   arrays and the struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(PetscCtxRt ctx)
{
  MatCOOStruct_SeqAIJ *coo_d = *(MatCOOStruct_SeqAIJ **)ctx;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo_d->jmap));
  PetscCallCUDA(cudaFree(coo_d->perm));
  PetscCall(PetscFree(coo_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4371ed502f03SStefano Zampini 
/* Set up the matrix for COO assembly: build the CSR nonzero pattern on the host,
   upload it to the GPU, and attach a device-resident copy of the COO->CSR maps
   (jmap, perm) so MatSetValuesCOO_SeqAIJCUSPARSE() can run entirely on device.

   Input Parameters:
   + mat   - the matrix
   . coo_n - number of COO entries
   . coo_i - COO row indices (host or device memory)
   - coo_j - COO column indices (assumed to be in the same memory space as coo_i)
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE; /* were coo_i/coo_j supplied in device memory? */
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d; /* host and device copies of the COO struct */

  PetscFunctionBegin;
  /* The host preallocation routine needs the indices on the host; copy them down if necessary */
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  /* Builds the CSR pattern and attaches the host COO struct as "__PETSc_MatCOOStruct_Host" */
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* pattern was just (re)built on the host */
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4412219fbbafSJunchao Zhang 
/* Kernel: fold the COO input values into the CSR value array.

   For CSR nonzero i, jmap[i]..jmap[i+1] delimits the range of perm[] holding the
   positions in kv[] of the COO entries that map to that nonzero; those values
   are summed.  INSERT_VALUES discards the previous a[i]; otherwise the sum is
   accumulated onto it.  A grid-stride loop (with the i < nnz bound) makes the
   kernel correct for any launch configuration. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4423219fbbafSJunchao Zhang 
/* Insert or add values into the matrix through the COO pattern established by
   MatSetPreallocationCOO_SeqAIJCUSPARSE(); the accumulation runs on the device.

   Input Parameters:
   + A     - the matrix
   . v     - the coo->n input values; may be in host or device memory
   - imode - INSERT_VALUES (overwrite) or ADD_VALUES (accumulate)
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz; /* number of nonzeros in the CSR pattern */
  PetscMemType         memtype;
  const PetscScalar   *v1 = v; /* device-accessible view of v */
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  /* Retrieve the device-resident COO->CSR maps attached during preallocation */
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, &coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT overwrites everything, so write-only access (no host-to-device copy) suffices */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* 256 threads per block, grid sized to cover Annz; the kernel uses a grid-stride loop */
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the temporary device copy */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4463219fbbafSJunchao Zhang 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both outputs must be requested; otherwise do nothing */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* lazily build (and cache) the uncompressed row offsets on the device from host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
45115f101d05SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only invalidate the caller's pointers; the device arrays stay owned by the matrix */
  if (j) *j = NULL;
  if (i) *i = NULL;
  (void)compressed; /* unused, kept for symmetry with MatSeqAIJCUSPARSEGetIJ() */
  PetscFunctionReturn(PETSC_SUCCESS);
}
45375f101d05SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* copies host data down if the host copy is newer */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offloadmask is left untouched and no transpose invalidation occurs */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4573ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read access cannot have changed the matrix, so no object-state bump here;
     just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4596ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* caller may read, so device data must be current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose may now be stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate the caller's pointer, then record that the matrix may have changed */
  *a = NULL;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4657039c6fbaSStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* no MatSeqAIJCUSPARSECopyToGPU() here: the caller promises to overwrite the values */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose may now be stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4696ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate the caller's pointer, then record that the matrix may have changed */
  *a = NULL;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4720ed502f03SStefano Zampini 
47219371c9d4SSatish Balay struct IJCompare4 {
operator ()IJCompare44722d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4723d71ae5a4SJacob Faibussowitsch   {
47240b156cc8SJunchao Zhang     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
47250b156cc8SJunchao Zhang     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4726ed502f03SStefano Zampini     return false;
4727ed502f03SStefano Zampini   }
4728ed502f03SStefano Zampini };
4729ed502f03SStefano Zampini 
47309371c9d4SSatish Balay struct Shift {
4731ed502f03SStefano Zampini   int _shift;
4732ed502f03SStefano Zampini 
ShiftShift4733ed502f03SStefano Zampini   Shift(int shift) : _shift(shift) { }
operator ()Shift47349371c9d4SSatish Balay   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4735ed502f03SStefano Zampini };
4736ed502f03SStefano Zampini 
473721afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat * C)4738d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4739d71ae5a4SJacob Faibussowitsch {
4740ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4741ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4742ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4743ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4744ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4745ed502f03SStefano Zampini   cusparseStatus_t              stat;
4746ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4747ed502f03SStefano Zampini 
4748ed502f03SStefano Zampini   PetscFunctionBegin;
4749ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4750ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
47514f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4752ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4753ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47545f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
475508401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4756aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4757aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4758ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4759ed502f03SStefano Zampini     m = A->rmap->n;
4760ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47619566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47629566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47639566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4764ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4765ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4766ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4767ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4768ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4769ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4770ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4771ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4772ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4773ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4774ed502f03SStefano Zampini     Ccusp->nrows            = m;
4775ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4776ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4777ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4778ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47799566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47809566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47819566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4782f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4783f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4784f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
47859566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47869566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47879566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47899566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
479028b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
479128b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4792ed502f03SStefano Zampini 
4793ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4794ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4795ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4796ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4797ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4798ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4799ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4800ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4801ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
48022c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4803ed502f03SStefano Zampini     if (c->nz) {
48042ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
48052ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
48062ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
48072ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
48082ed87e7eSStefano Zampini 
4809ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4810ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4811ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4812ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
48139566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4814ed502f03SStefano Zampini         }
48152ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
48162ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4817ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4818ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4819ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4820ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
48219566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4822ed502f03SStefano Zampini         }
48232ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
48242ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
48259566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48269371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48279371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48289371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48299371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48302ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
48312ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
48322ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
48338909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4834ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4835ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
48368909a122SStefano Zampini #else
48378909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
48388909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
48398909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
48408909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
48418909a122SStefano Zampini #endif
48422ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
48432ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
48442ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
48452ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48462ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48472ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
48482c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
48492c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
485029d3d2f8SNuno Nobre #if CCCL_VERSION >= 3001000
485129d3d2f8SNuno Nobre       cuda::std::advance(p2, Annz);
485229d3d2f8SNuno Nobre #else
4853ed502f03SStefano Zampini       thrust::advance(p2, Annz);
485429d3d2f8SNuno Nobre #endif
4855792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48568909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48578909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48588909a122SStefano Zampini #endif
48592ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48602ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48612ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4862792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48632ed87e7eSStefano Zampini #else
486459c3d2bbSPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
48652ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
486659c3d2bbSPierre Jolivet   #else
486759c3d2bbSPierre Jolivet       auto pred = cuda::std::identity();
486859c3d2bbSPierre Jolivet   #endif
4869792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4870792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48712ed87e7eSStefano Zampini #endif
48729371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48739371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48749566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48752ed87e7eSStefano Zampini       delete wPerm;
48762ed87e7eSStefano Zampini       delete Acoo;
48772ed87e7eSStefano Zampini       delete Bcoo;
48782ed87e7eSStefano Zampini       delete Ccoo;
4879ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48809371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48819371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4882ed502f03SStefano Zampini #endif
48831a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
48849566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48859566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4886ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4887ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4888ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4889ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4890ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4891ed502f03SStefano Zampini 
48921a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48931a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4894a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4895ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4896ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4897ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4898ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4899ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4900ed502f03SStefano Zampini 
4901ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4902ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4903ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4904ed502f03SStefano Zampini 
49059566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4906ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4907ed502f03SStefano Zampini         if (AT) {
4908ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
490929d3d2f8SNuno Nobre #if CCCL_VERSION >= 3001000
491029d3d2f8SNuno Nobre           cuda::std::advance(rT, -1);
491129d3d2f8SNuno Nobre #else
4912ed502f03SStefano Zampini           thrust::advance(rT, -1);
491329d3d2f8SNuno Nobre #endif
4914ed502f03SStefano Zampini         }
4915ed502f03SStefano Zampini         if (BT) {
4916ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4917ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4918ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4919ed502f03SStefano Zampini         }
4920ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4921ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4922ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4923ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4924ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4925ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49269566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4927ed502f03SStefano Zampini 
49289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
49299566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
49309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4931f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4932f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4933f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
49349566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
49359566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
49369566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4937ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
49389371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
49399371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4940ed502f03SStefano Zampini #endif
4941ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4942ed502f03SStefano Zampini       }
4943ed502f03SStefano Zampini     }
4944ed502f03SStefano Zampini 
4945ed502f03SStefano Zampini     c->free_a = PETSC_TRUE;
49469f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
49479f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4948ed502f03SStefano Zampini     c->free_ij = PETSC_TRUE;
49497de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4950ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4951ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4952ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4953ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
49549566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49559566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4956ed502f03SStefano Zampini     } else {
49579566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49589566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4959ed502f03SStefano Zampini     }
49609566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49619566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49629566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4963ed502f03SStefano Zampini     c->maxnz         = c->nz;
4964ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4965ed502f03SStefano Zampini     c->rmax          = 0;
4966ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4967ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4968ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4969ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4970ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4971ed502f03SStefano Zampini     }
49729566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4973ed502f03SStefano Zampini     (*C)->nonzerostate++;
49749566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49759566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4976ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4977ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4978ed502f03SStefano Zampini   } else {
497908401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4980ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4981ed502f03SStefano Zampini     if (c->nz) {
4982ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49832c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4984aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
498508401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49869566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49879566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49885f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49895f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4990ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4991ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4992ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4993aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4994aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4995aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4996aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49972c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
49982c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
499929d3d2f8SNuno Nobre #if CCCL_VERSION >= 3001000
500029d3d2f8SNuno Nobre       cuda::std::advance(pmid, Acsr->num_entries);
500129d3d2f8SNuno Nobre #else
5002ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
500329d3d2f8SNuno Nobre #endif
50049566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
50052c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
50069371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5007ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
50089371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
50092c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
5010ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
50119566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
50121a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
50135f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5014ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5015ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5016ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5017ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5018ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
5019ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5020ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
50211a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
5022ed502f03SStefano Zampini       }
50239566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
5024ed502f03SStefano Zampini     }
5025ed502f03SStefano Zampini   }
50269566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5027ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
5028ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
5029ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
50303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5031ed502f03SStefano Zampini }
5032c215019aSStefano Zampini 
/* Copies entries of A's device CSR value array into v: v[i] = av[idx[i]] when idx is given,
   otherwise the first n entries verbatim. v may be host or device memory (detected at runtime). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* true when v points to device memory */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the (host) index set and gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device scratch; copied back to the host buffer below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: contiguous copy of the leading n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* bug fix: when v is host memory the data moved device->host, so log GPU-to-CPU traffic
     (previously logged with PetscLogCpuToGpu, i.e. the wrong direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5068