xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 59c3d2bb33742d69e283a65b5862635693bba9f3)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14
18d0967f54SJacob Faibussowitsch #endif
19a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
20a2cee5feSJed Brown #include <thrust/remove.h>
21a2cee5feSJed Brown #include <thrust/sort.h>
22a2cee5feSJed Brown #include <thrust/unique.h>
23*59c3d2bbSPierre Jolivet #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24*59c3d2bbSPierre Jolivet   #include <cuda/std/functional>
25*59c3d2bbSPierre Jolivet #endif
26e8d2b73aSMark Adams 
27b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
/* Names of the GPU storage formats (plus enum-type name and option prefix), in enum-value order, for PetscOptionsEnum() parsing */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
29afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
30afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32afb2bd1cSJunchao Zhang 
33afb2bd1cSJunchao Zhang   typedef enum {
34afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
35afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
36afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
37afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
38afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
39afb2bd1cSJunchao Zhang 
40afb2bd1cSJunchao Zhang   typedef enum {
41afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
42afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
43afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
44afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
45afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
50afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
51afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
52afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
53afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
54afb2bd1cSJunchao Zhang 
55afb2bd1cSJunchao Zhang   typedef enum {
5635cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
5735cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
58afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
59afb2bd1cSJunchao Zhang   */
/* Entries are listed in 0-based enum-value order (see comment above) so PetscOptionsEnum() maps a user's choice
   directly onto the corresponding cuSPARSE enum value; the PetscCheck()s in MatSetFromOptions_SeqAIJCUSPARSE()
   verify this ordering still matches the cuSPARSE headers */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
63afb2bd1cSJunchao Zhang #endif
649ae82921SPaul Mullowney 
65087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
66087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
67087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
686fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
69b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
706fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
716fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
72d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
74d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
75d460d7bfSJunchao Zhang #endif
76ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
77a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
796fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
816fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
826fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
84e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
85e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
869ae82921SPaul Mullowney 
877f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
88470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
902c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
917f756511SDominic Meiser 
9257181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9457181aedSStefano Zampini 
95c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98c215019aSStefano Zampini 
// Implementation behind the "MatCUSPARSESetFormat_C" composed method: records the
// requested GPU storage format in the matrix's Mat_SeqAIJCUSPARSE structure.
// A sequential AIJ matrix has a single format field, so MAT_CUSPARSE_MULT and
// MAT_CUSPARSE_ALL are handled identically; any other operation is rejected.
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: // fall through: both operations set the same (single) format field
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1169ae82921SPaul Mullowney 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

  Note:
  This is a no-op (via `PetscTryMethod()`) if `A` is not of a CUSPARSE matrix type.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
140e057df02SPaul Mullowney 
// Implementation behind the "MatCUSPARSESetUseCPUSolve_C" composed method: records
// whether MatSolve() should run on the CPU for this MATSEQAIJCUSPARSE matrix.
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
149365b711fSMark Adams 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
173365b711fSMark Adams 
// MatSetOption() implementation for MATSEQAIJCUSPARSE: handles the option that
// needs GPU-side bookkeeping here and delegates everything else to the plain
// SeqAIJ implementation.
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
189e6e9a74fSStefano Zampini 
// Process the options-database entries controlling the cuSPARSE backend of a
// MATSEQAIJCUSPARSE matrix: the GPU storage format, CPU-vs-GPU triangular solve,
// and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
// The options only apply to non-factored matrices (A->factortype == MAT_FACTOR_NONE).
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    // -mat_cusparse_mult_storage_format sets the format for SpMV only
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    // -mat_cusparse_storage_format sets the format for all operations
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
2259ae82921SPaul Mullowney 
226b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Assemble the LU factors computed on the CPU (stored in the "skewed" SeqAIJ
// factored-matrix layout, with a->i/a->diag indexing L and U parts of each row)
// into one regular CSR matrix on the GPU holding L (unit diagonal implied) and
// U (with its non-unit diagonal), then run/refresh the cusparseSpSV analysis
// needed for the subsequent triangular solves. No-op unless the newest factor
// values live on the CPU (A->offloadmask == PETSC_OFFLOAD_CPU).
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];     // number of L entries in row i (strictly below the diagonal)
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of U entries in row i, including the diagonal; note Adiag[] decreases with i
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device; only the sparsity pattern is uploaded here, values follow below
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // Both descriptors alias the same device CSR arrays; the fill-mode/diag-type attributes select which triangle each one reads
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep the host row pointers and value buffer so later numeric refactorizations only refill values
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value: refill the host value buffer from the newest CPU factors, then upload to the device
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry: the CPU factor stores its reciprocal
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; // transpose solves must redo their own analysis
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
331d460d7bfSJunchao Zhang #else
332d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
333d71ae5a4SJacob Faibussowitsch {
3349ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3359ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3369ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
337aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3389ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3399ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3409ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3419ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3429ae82921SPaul Mullowney 
3439ae82921SPaul Mullowney   PetscFunctionBegin;
3443ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
345c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3469ae82921SPaul Mullowney     try {
3479ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3489ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
349da79fbbcSStefano Zampini       if (!loTriFactor) {
3502cbc15d9SMark         PetscScalar *AALo;
3512cbc15d9SMark 
3529566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3539ae82921SPaul Mullowney 
3549ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3559566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3569566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3579ae82921SPaul Mullowney 
3589ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3599ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3609ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3619ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3629ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3639ae82921SPaul Mullowney         v         = aa;
3649ae82921SPaul Mullowney         vi        = aj;
3659ae82921SPaul Mullowney         offset    = 1;
3669ae82921SPaul Mullowney         rowOffset = 1;
3679ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3689ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
369e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3709ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3719ae82921SPaul Mullowney           rowOffset += nz + 1;
3729ae82921SPaul Mullowney 
373f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
374f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
3759ae82921SPaul Mullowney 
3769ae82921SPaul Mullowney           offset += nz;
3779ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3789ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3799ae82921SPaul Mullowney           offset += 1;
3809ae82921SPaul Mullowney 
3819ae82921SPaul Mullowney           v += nz;
3829ae82921SPaul Mullowney           vi += nz;
3839ae82921SPaul Mullowney         }
3842205254eSKarl Rupp 
385aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3869566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
387da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
388aa372e3fSPaul Mullowney         /* Create the matrix description */
3899566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3909566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3911b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3929566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
393afb2bd1cSJunchao Zhang   #else
3949566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
395afb2bd1cSJunchao Zhang   #endif
3969566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3979566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
398aa372e3fSPaul Mullowney 
399aa372e3fSPaul Mullowney         /* set the operation */
400aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
401aa372e3fSPaul Mullowney 
402aa372e3fSPaul Mullowney         /* set the matrix */
403aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
404aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
405aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
406aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
407aa372e3fSPaul Mullowney 
408aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
409aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
410aa372e3fSPaul Mullowney 
411aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
412aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
413aa372e3fSPaul Mullowney 
414aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
415aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
416aa372e3fSPaul Mullowney 
417afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4189566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
419261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
4201b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4219371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4229371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
4239566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
424afb2bd1cSJunchao Zhang   #endif
425afb2bd1cSJunchao Zhang 
426aa372e3fSPaul Mullowney         /* perform the solve analysis */
4279371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4289f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4299566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4309566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
431aa372e3fSPaul Mullowney 
432da79fbbcSStefano Zampini         /* assign the pointer */
433aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4342cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4359566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4369566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4379566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
438da79fbbcSStefano Zampini       } else { /* update values only */
43948a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
440da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4412cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
442da79fbbcSStefano Zampini         v                    = aa;
443da79fbbcSStefano Zampini         vi                   = aj;
444da79fbbcSStefano Zampini         offset               = 1;
445da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
446da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
447f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
448da79fbbcSStefano Zampini           offset += nz;
4492cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
450da79fbbcSStefano Zampini           offset += 1;
451da79fbbcSStefano Zampini           v += nz;
452da79fbbcSStefano Zampini         }
4532cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4549566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
455da79fbbcSStefano Zampini       }
456d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
457d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
458d71ae5a4SJacob Faibussowitsch     }
4599ae82921SPaul Mullowney   }
4603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4619ae82921SPaul Mullowney }
4629ae82921SPaul Mullowney 
463d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
464d71ae5a4SJacob Faibussowitsch {
4659ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
4669ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
4679ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
468aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
4699ae82921SPaul Mullowney   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
4709ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
4719ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4729ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
4739ae82921SPaul Mullowney 
4749ae82921SPaul Mullowney   PetscFunctionBegin;
4753ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
476c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4779ae82921SPaul Mullowney     try {
4789ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4799ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
480da79fbbcSStefano Zampini       if (!upTriFactor) {
4812cbc15d9SMark         PetscScalar *AAUp;
4822cbc15d9SMark 
4839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
4842cbc15d9SMark 
4859ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4869566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4879566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4889ae82921SPaul Mullowney 
4899ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4909ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4919ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4929ae82921SPaul Mullowney         offset  = nzUpper;
4939ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4949ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4959ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4969ae82921SPaul Mullowney 
497e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4989ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4999ae82921SPaul Mullowney 
500e057df02SPaul Mullowney           /* decrement the offset */
5019ae82921SPaul Mullowney           offset -= (nz + 1);
5029ae82921SPaul Mullowney 
503e057df02SPaul Mullowney           /* first, set the diagonal elements */
5049ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
50509f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
5069ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
5079ae82921SPaul Mullowney 
508f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
509f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
5109ae82921SPaul Mullowney         }
5112205254eSKarl Rupp 
512aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
5139566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
514da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5152205254eSKarl Rupp 
516aa372e3fSPaul Mullowney         /* Create the matrix description */
5179566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
5189566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
5191b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
5209566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
521afb2bd1cSJunchao Zhang   #else
5229566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
523afb2bd1cSJunchao Zhang   #endif
5249566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
5259566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
526aa372e3fSPaul Mullowney 
527aa372e3fSPaul Mullowney         /* set the operation */
528aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
529aa372e3fSPaul Mullowney 
530aa372e3fSPaul Mullowney         /* set the matrix */
531aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
532aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
533aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
534aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
535aa372e3fSPaul Mullowney 
536aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
537aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
538aa372e3fSPaul Mullowney 
539aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
540aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
541aa372e3fSPaul Mullowney 
542aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
543aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
544aa372e3fSPaul Mullowney 
545afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
5469566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
547261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
5481b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
5499371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5509371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
5519566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
552afb2bd1cSJunchao Zhang   #endif
553afb2bd1cSJunchao Zhang 
554aa372e3fSPaul Mullowney         /* perform the solve analysis */
5559371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5569f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
5579f7ba44dSJacob Faibussowitsch 
5589566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
5599566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
560aa372e3fSPaul Mullowney 
561da79fbbcSStefano Zampini         /* assign the pointer */
562aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
5632cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
5649566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
5659566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
5669566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
567da79fbbcSStefano Zampini       } else {
56848a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
569da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
570da79fbbcSStefano Zampini         offset = nzUpper;
571da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
572da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
573da79fbbcSStefano Zampini 
574da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
575da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
576da79fbbcSStefano Zampini 
577da79fbbcSStefano Zampini           /* decrement the offset */
578da79fbbcSStefano Zampini           offset -= (nz + 1);
579da79fbbcSStefano Zampini 
580da79fbbcSStefano Zampini           /* first, set the diagonal elements */
5812cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
582f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
583da79fbbcSStefano Zampini         }
5842cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5859566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
586da79fbbcSStefano Zampini       }
587d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
588d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
589d71ae5a4SJacob Faibussowitsch     }
5909ae82921SPaul Mullowney   }
5913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5929ae82921SPaul Mullowney }
593d460d7bfSJunchao Zhang #endif
5949ae82921SPaul Mullowney 
595d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
596d71ae5a4SJacob Faibussowitsch {
5979ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
5989ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
599c9e33d71SJunchao Zhang   IS                            isrow = a->row, isicol = a->icol;
6009ae82921SPaul Mullowney   PetscBool                     row_identity, col_identity;
6019ae82921SPaul Mullowney   PetscInt                      n = A->rmap->n;
6029ae82921SPaul Mullowney 
6039ae82921SPaul Mullowney   PetscFunctionBegin;
60428b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
605b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
606d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
607d460d7bfSJunchao Zhang #else
6089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
6099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
610ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
611d460d7bfSJunchao Zhang #endif
612d460d7bfSJunchao Zhang 
613aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = a->nz;
6149ae82921SPaul Mullowney 
615d460d7bfSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
616e057df02SPaul Mullowney   /* lower triangular indices */
6179566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
618da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
619da79fbbcSStefano Zampini     const PetscInt *r;
620da79fbbcSStefano Zampini 
6219566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow, &r));
622aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
623aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r + n);
6249566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow, &r));
6259566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
626da79fbbcSStefano Zampini   }
6279ae82921SPaul Mullowney 
628e057df02SPaul Mullowney   /* upper triangular indices */
629c9e33d71SJunchao Zhang   PetscCall(ISIdentity(isicol, &col_identity));
630da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
631da79fbbcSStefano Zampini     const PetscInt *c;
632da79fbbcSStefano Zampini 
633c9e33d71SJunchao Zhang     PetscCall(ISGetIndices(isicol, &c));
634aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
635aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c + n);
636c9e33d71SJunchao Zhang     PetscCall(ISRestoreIndices(isicol, &c));
6379566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
638da79fbbcSStefano Zampini   }
6393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6409ae82921SPaul Mullowney }
6419ae82921SPaul Mullowney 
642b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
643d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
644d460d7bfSJunchao Zhang {
645d460d7bfSJunchao Zhang   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
646d460d7bfSJunchao Zhang   PetscInt                      m  = A->rmap->n;
647d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
648d460d7bfSJunchao Zhang   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
649d460d7bfSJunchao Zhang   const MatScalar              *Aa = a->a;
650d460d7bfSJunchao Zhang   PetscInt                     *Mj, Mnz;
651d460d7bfSJunchao Zhang   PetscScalar                  *Ma, *D;
652d460d7bfSJunchao Zhang 
653d460d7bfSJunchao Zhang   PetscFunctionBegin;
654d460d7bfSJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
655d460d7bfSJunchao Zhang     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
656d460d7bfSJunchao Zhang       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
657d460d7bfSJunchao Zhang       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
658d460d7bfSJunchao Zhang       Mnz = Ai[m]; // Unz (with the unit diagonal)
659d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Ma));
660d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
661d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(m, &D));    // the diagonal
662d460d7bfSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
663d460d7bfSJunchao Zhang         PetscInt ulen = Ai[i + 1] - Ai[i];
664d460d7bfSJunchao Zhang         Mj[Ai[i]]     = i;                                              // diagonal entry
665d460d7bfSJunchao Zhang         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
666d460d7bfSJunchao Zhang       }
667d460d7bfSJunchao Zhang       // Copy M (U) from host to device
668f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
669f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
670f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
671f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
672d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
673d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
674d460d7bfSJunchao Zhang 
675d460d7bfSJunchao Zhang       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
676d460d7bfSJunchao Zhang       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
677d460d7bfSJunchao Zhang       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
678d460d7bfSJunchao Zhang       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
679d460d7bfSJunchao Zhang       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
680d460d7bfSJunchao Zhang       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
681d460d7bfSJunchao Zhang       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
682d460d7bfSJunchao Zhang       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
683d460d7bfSJunchao Zhang 
684d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
685d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
686d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
687d460d7bfSJunchao Zhang 
688d460d7bfSJunchao Zhang       // Allocate work vectors in SpSv
689f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
690f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
691d460d7bfSJunchao Zhang 
692d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
693d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
694d460d7bfSJunchao Zhang 
695d460d7bfSJunchao Zhang       // Query buffer sizes for SpSV and then allocate buffers
696d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
697d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
698d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
699d460d7bfSJunchao Zhang 
700aaa8cc7dSPierre Jolivet       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
701d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
702d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
703d460d7bfSJunchao Zhang 
704d460d7bfSJunchao Zhang       // Record for reuse
705d460d7bfSJunchao Zhang       fs->csrVal_h = Ma;
706d460d7bfSJunchao Zhang       fs->diag_h   = D;
707d460d7bfSJunchao Zhang       PetscCall(PetscFree(Mj));
708d460d7bfSJunchao Zhang     }
709d460d7bfSJunchao Zhang     // Copy the value
710d460d7bfSJunchao Zhang     Ma  = fs->csrVal_h;
711d460d7bfSJunchao Zhang     D   = fs->diag_h;
712d460d7bfSJunchao Zhang     Mnz = Ai[m];
713d460d7bfSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
714d460d7bfSJunchao Zhang       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
715d460d7bfSJunchao Zhang       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
716d460d7bfSJunchao Zhang       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
717d460d7bfSJunchao Zhang     }
718d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
719d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
720d460d7bfSJunchao Zhang 
721204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
722204a0e31SJunchao Zhang     if (fs->updatedSpSVAnalysis) {
723204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
724204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
725204a0e31SJunchao Zhang     } else
726204a0e31SJunchao Zhang   #endif
727204a0e31SJunchao Zhang     {
728d460d7bfSJunchao Zhang       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
729d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
730d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
731204a0e31SJunchao Zhang       fs->updatedSpSVAnalysis = PETSC_TRUE;
732204a0e31SJunchao Zhang     }
733d460d7bfSJunchao Zhang   }
734d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
735d460d7bfSJunchao Zhang }
736d460d7bfSJunchao Zhang 
737d460d7bfSJunchao Zhang // Solve Ut D U x = b
738d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
739d460d7bfSJunchao Zhang {
740d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
741d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
742d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
743d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
744d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
745d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
746d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
747d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
748d460d7bfSJunchao Zhang 
749d460d7bfSJunchao Zhang   PetscFunctionBegin;
750d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
751d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
752d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
753d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
754d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
755d460d7bfSJunchao Zhang 
756d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
757d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
758d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
759d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
760d460d7bfSJunchao Zhang   } else {
761d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
762d460d7bfSJunchao Zhang   }
763d460d7bfSJunchao Zhang 
764d460d7bfSJunchao Zhang   // Solve Ut Y = X
765d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
766d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
767d460d7bfSJunchao Zhang 
768d460d7bfSJunchao Zhang   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
769d460d7bfSJunchao Zhang   // It is basically a vector element-wise multiplication, but cublas does not have it!
770d460d7bfSJunchao Zhang   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
771d460d7bfSJunchao Zhang 
772d460d7bfSJunchao Zhang   // Solve U X = Y
773d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
774d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
775d460d7bfSJunchao Zhang   } else {
776d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
777d460d7bfSJunchao Zhang   }
778d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
779d460d7bfSJunchao Zhang 
780d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
781d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
782d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
783d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
784d460d7bfSJunchao Zhang   }
785d460d7bfSJunchao Zhang 
786d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
787d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
788d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
789d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
790d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
791d460d7bfSJunchao Zhang }
792d460d7bfSJunchao Zhang #else
/*
  Build (or numerically refresh) the GPU triangular-factor structures used to apply an
  ICC/Cholesky factorization with cuSPARSE (legacy, pre CUDA-11.4 code path).

  The host-side factor lives in A->data, which is viewed both as Mat_SeqAIJ (for a->nz)
  and as Mat_SeqSBAIJ (for the i/j/a arrays).  NOTE(review): this relies on the symbolic
  factorization upstream having stored the factor in SBAIJ-style upper-triangular layout
  with the diagonal entry last in each row (the code below reads the diagonal as v[nz]) —
  confirm against the ICC factorization routines.

  Two factors are staged in pinned host buffers and uploaded:
    - upTriFactor: CSR of U with unit diagonal (diag stored as 1/d, off-diagonals negated),
      solved with CUSPARSE_OPERATION_NON_TRANSPOSE;
    - loTriFactor: reuses the SAME sparsity pattern (same AiUp/AjUp), values scaled by the
      row diagonal, and is solved as the transpose (CUSPARSE_OPERATION_TRANSPOSE with
      fill mode still UPPER and a non-unit diagonal).

  On re-factorization (both factor structs already exist) only the numerical values are
  recomputed and re-assigned; the structure, descriptors, and solve analysis are reused.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* pinned host CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;        /* pinned host values of U */
  PetscScalar                       *AALo;        /* pinned host values of the transpose-solved factor */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the GPU copy is absent or stale relative to the host */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* First factorization: allocate space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix: each output row stores the diagonal first,
           then the off-diagonal entries copied from the host factor */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers into row i of the host factor */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: U gets 1/d(i); the transpose factor gets the same */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* off-diagonals: negate for U; additionally scale by 1/d(i) for the transpose factor */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description: upper fill, unit diagonal (diag handled via scaling above) */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix: upload the freshly built CSR of U to the device */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description.  The "lower" factor reuses U's pattern and is
           solved as Ut: fill mode stays UPPER, diagonal is NON_UNIT (values hold 1/d) */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation: transpose solve of the upper-stored factor */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same row offsets and column indices as U, scaled values AALo */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Re-factorization: the sparsity pattern is unchanged, so only the numerical
           values are recomputed (same scaling as above) and re-uploaded */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers into row i of the host factor */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
989d460d7bfSJunchao Zhang #endif
990087f3262SPaul Mullowney 
/* Set up the GPU-side ICC/Cholesky solve data for the factor matrix A: build the
   factored-matrix (CUDA >= 11.4) or legacy triangular-factor structures, record the
   factor's nonzero count, and stage the row/column permutation indices on the device
   when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscBool                     identity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  fs->nnz        = (aij->nz - m) * 2 + m; /* off-diagonals counted for both triangles, diagonal once */

  /* Stage the permutation vectors on the device unless the ordering is natural */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *ci, *ri;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(rowperm, &ri));
    PetscCall(ISGetIndices(invperm, &ci));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(ri, ri + m);
    fs->cpermIndices->assign(ci, ci + m);
    PetscCall(ISRestoreIndices(rowperm, &ri));
    PetscCall(ISRestoreIndices(invperm, &ci));
    PetscCall(ISDestroy(&invperm));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1032087f3262SPaul Mullowney 
/* Numeric ICC/Cholesky factorization for SeqAIJCUSPARSE matrices: pull A down to the
   host, run the plain SeqAIJ numeric factorization, install the CUSPARSE solve
   callbacks on the factor B, and upload the triangular-factor data to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* Pick the MatSolve flavor: the natural-ordering variants skip the permutation step */
  Mat_SeqAIJ *fact = (Mat_SeqAIJ *)B->data;
  PetscBool   natural;

  PetscCall(ISIdentity(fact->row, &natural));
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build/upload the triangular factor data */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
10659ae82921SPaul Mullowney 
1066b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1067d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1068d71ae5a4SJacob Faibussowitsch {
1069bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1070aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1071aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1072da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1073da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1074aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1075aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1076aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1077aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1078b175d8bbSPaul Mullowney 
1079bda325fcSPaul Mullowney   PetscFunctionBegin;
1080aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10819566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1082da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1083aa372e3fSPaul Mullowney 
1084aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1085aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1086aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10879371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1088aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1089aa372e3fSPaul Mullowney 
1090aa372e3fSPaul Mullowney   /* Create the matrix description */
10919566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10929566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10939566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10949566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10959566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1096aa372e3fSPaul Mullowney 
1097aa372e3fSPaul Mullowney   /* set the operation */
1098aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1099aa372e3fSPaul Mullowney 
1100aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1101aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1102afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1103afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1104aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1105afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1106afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1107afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1108aa372e3fSPaul Mullowney 
1109aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1110afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11119371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
11129371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
11139371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
11149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1115afb2bd1cSJunchao Zhang   #endif
1116afb2bd1cSJunchao Zhang 
11179566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11189f7ba44dSJacob Faibussowitsch   {
11199f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11209f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
11219371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1122afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11239f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1124afb2bd1cSJunchao Zhang   #else
11259f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1126afb2bd1cSJunchao Zhang   #endif
11279f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11289f7ba44dSJacob Faibussowitsch   }
11299f7ba44dSJacob Faibussowitsch 
11309566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11319566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1132aa372e3fSPaul Mullowney 
1133afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11349566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1135261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11361b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11379371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11389371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11399566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1140afb2bd1cSJunchao Zhang   #endif
1141afb2bd1cSJunchao Zhang 
1142afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11439371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11449f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11459f7ba44dSJacob Faibussowitsch 
11469566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11479566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1148aa372e3fSPaul Mullowney 
1149da79fbbcSStefano Zampini   /* assign the pointer */
1150aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1151aa372e3fSPaul Mullowney 
1152aa372e3fSPaul Mullowney   /*********************************************/
1153aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1154aa372e3fSPaul Mullowney   /*********************************************/
1155aa372e3fSPaul Mullowney 
1156aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11579566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1158da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1159aa372e3fSPaul Mullowney 
1160aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1161aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1162aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11639371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1164aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1165aa372e3fSPaul Mullowney 
1166aa372e3fSPaul Mullowney   /* Create the matrix description */
11679566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11689566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11699566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11709566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11719566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1172aa372e3fSPaul Mullowney 
1173aa372e3fSPaul Mullowney   /* set the operation */
1174aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1175aa372e3fSPaul Mullowney 
1176aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1177aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1178afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1179afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1180aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1181afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1182afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1183afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1184aa372e3fSPaul Mullowney 
1185aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1186afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11879371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11889371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11899371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11909566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1191afb2bd1cSJunchao Zhang   #endif
1192afb2bd1cSJunchao Zhang 
11939566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11949f7ba44dSJacob Faibussowitsch   {
11959f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11969f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11979371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1198afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11999f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1200afb2bd1cSJunchao Zhang   #else
12019f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1202afb2bd1cSJunchao Zhang   #endif
12039f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
12049f7ba44dSJacob Faibussowitsch   }
1205d49cd2b7SBarry Smith 
12069566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12079566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1208aa372e3fSPaul Mullowney 
1209afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
12109566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1211261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
12121b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12139371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12149371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
12159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1216afb2bd1cSJunchao Zhang   #endif
1217afb2bd1cSJunchao Zhang 
1218afb2bd1cSJunchao Zhang   /* perform the solve analysis */
12195f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
12209371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12219f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1222d49cd2b7SBarry Smith 
12239566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12249566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1225aa372e3fSPaul Mullowney 
1226da79fbbcSStefano Zampini   /* assign the pointer */
1227aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1229bda325fcSPaul Mullowney }
1230d460d7bfSJunchao Zhang #endif
1231bda325fcSPaul Mullowney 
12329371c9d4SSatish Balay struct PetscScalarToPetscInt {
12339371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1234a49f1ed0SStefano Zampini };
1235a49f1ed0SStefano Zampini 
1236d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1237d71ae5a4SJacob Faibussowitsch {
1238aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1239a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1240bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1241bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1242aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1243b175d8bbSPaul Mullowney 
1244bda325fcSPaul Mullowney   PetscFunctionBegin;
12459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1246a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
124728b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1248a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
124908401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
12503ba16761SJacob Faibussowitsch   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
12519566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
12529566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
125348a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1254a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1255aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12569566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1257aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12589566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12599566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1260aa372e3fSPaul Mullowney 
1261b06137fdSPaul Mullowney     /* set alpha and beta */
1262f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1263f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1264f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
12659566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12669566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12679566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1268b06137fdSPaul Mullowney 
1269aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1270aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1271a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1272554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1273554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1274aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1275a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1276aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1277aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1278a3fdcf43SKarl Rupp 
1279ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
128081902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1281afb2bd1cSJunchao Zhang 
1282afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
12833606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
12849371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12859371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
12869371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
12873606e59fSJunchao Zhang   #else
12883606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12893606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12903606e59fSJunchao Zhang 
12913606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12923606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12933606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12943606e59fSJunchao Zhang         */
12953606e59fSJunchao Zhang       if (matrixT->num_entries) {
12969371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
12979371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
12983606e59fSJunchao Zhang 
12993606e59fSJunchao Zhang       } else {
13003606e59fSJunchao Zhang         matstructT->matDescr = NULL;
13013606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
13023606e59fSJunchao Zhang       }
13033606e59fSJunchao Zhang   #endif
1304afb2bd1cSJunchao Zhang #endif
1305aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1306afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1307afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1308afb2bd1cSJunchao Zhang #else
1309aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
131051c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
131151c6d536SStefano Zampini       /* First convert HYB to CSR */
1312aa372e3fSPaul Mullowney       temp->num_rows       = A->rmap->n;
1313aa372e3fSPaul Mullowney       temp->num_cols       = A->cmap->n;
1314aa372e3fSPaul Mullowney       temp->num_entries    = a->nz;
1315aa372e3fSPaul Mullowney       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1316aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1317aa372e3fSPaul Mullowney       temp->values         = new THRUSTARRAY(a->nz);
1318aa372e3fSPaul Mullowney 
13199371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
13209371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1321aa372e3fSPaul Mullowney 
1322aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1323aa372e3fSPaul Mullowney       tempT->num_rows       = A->rmap->n;
1324aa372e3fSPaul Mullowney       tempT->num_cols       = A->cmap->n;
1325aa372e3fSPaul Mullowney       tempT->num_entries    = a->nz;
1326aa372e3fSPaul Mullowney       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1327aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1328aa372e3fSPaul Mullowney       tempT->values         = new THRUSTARRAY(a->nz);
1329aa372e3fSPaul Mullowney 
13309371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
13319371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13329371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1333aa372e3fSPaul Mullowney 
1334aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1335aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13369566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
13379371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
13389371c9d4SSatish Balay       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
13399371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1340aa372e3fSPaul Mullowney 
1341aa372e3fSPaul Mullowney       /* assign the pointer */
1342aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13431a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1344aa372e3fSPaul Mullowney       /* delete temporaries */
1345aa372e3fSPaul Mullowney       if (tempT) {
1346aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1347aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1348aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1349aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1350087f3262SPaul Mullowney       }
1351aa372e3fSPaul Mullowney       if (temp) {
1352aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1353aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1354aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1355aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1356aa372e3fSPaul Mullowney       }
1357afb2bd1cSJunchao Zhang #endif
1358aa372e3fSPaul Mullowney     }
1359a49f1ed0SStefano Zampini   }
1360a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1361a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1362a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
136328b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
136428b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
136528b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
136628b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
136728b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
136828b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
136928b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
137028b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1371a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1372a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1373a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
13749566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1375a49f1ed0SStefano Zampini     }
1376a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1377a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1378792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1379a49f1ed0SStefano Zampini 
1380a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1381a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1382a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1383a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
13849371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
13859371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
13869371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
13879566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1388a49f1ed0SStefano Zampini #endif
1389a49f1ed0SStefano Zampini 
13901a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13911a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13921a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13931a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13941a2c6b5cSJunchao Zhang 
13951a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13961a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13971a2c6b5cSJunchao Zhang         */
13989371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1399a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
14009371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
14019371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1402a49f1ed0SStefano Zampini #else
14039371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
14049371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1405a49f1ed0SStefano Zampini #endif
14061a2c6b5cSJunchao Zhang       } else {
14071a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
14081a2c6b5cSJunchao Zhang       }
14091a2c6b5cSJunchao Zhang 
1410a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1411792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1412a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
14139566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1414a49f1ed0SStefano Zampini #endif
1415a49f1ed0SStefano Zampini     }
14169371c9d4SSatish Balay     PetscCallThrust(
14179371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1418a49f1ed0SStefano Zampini   }
14199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14209566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1421213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1422213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1423aa372e3fSPaul Mullowney   /* assign the pointer */
1424aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
14251a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
14263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1427bda325fcSPaul Mullowney }
1428bda325fcSPaul Mullowney 
1429b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1430d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1431d460d7bfSJunchao Zhang {
1432d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1433d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1434d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1435d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1436d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1437d460d7bfSJunchao Zhang   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1438d460d7bfSJunchao Zhang   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1439d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1440d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1441d460d7bfSJunchao Zhang 
1442d460d7bfSJunchao Zhang   PetscFunctionBegin;
1443d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1444d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1445d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1446d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1447d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1448d460d7bfSJunchao Zhang 
1449d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1450d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1451d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1452d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1453d460d7bfSJunchao Zhang   } else {
1454d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1455d460d7bfSJunchao Zhang   }
1456d460d7bfSJunchao Zhang 
1457d460d7bfSJunchao Zhang   // Solve L Y = X
1458d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1459d460d7bfSJunchao Zhang   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1460d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1461d460d7bfSJunchao Zhang 
1462d460d7bfSJunchao Zhang   // Solve U X = Y
1463d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1464d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1465d460d7bfSJunchao Zhang   } else {
1466d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1467d460d7bfSJunchao Zhang   }
1468d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1469d460d7bfSJunchao Zhang 
1470d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1471d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1472d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1473d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1474d460d7bfSJunchao Zhang   }
1475d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1476d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1477d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1478d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1479d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1480d460d7bfSJunchao Zhang }
1481d460d7bfSJunchao Zhang 
1482d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1483d460d7bfSJunchao Zhang {
1484d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1485d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1486d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1487d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1488d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1489d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1490d460d7bfSJunchao Zhang   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1491d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1492d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1493d460d7bfSJunchao Zhang 
1494d460d7bfSJunchao Zhang   PetscFunctionBegin;
1495d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1496d460d7bfSJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1497d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1498d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1499d460d7bfSJunchao Zhang                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1500d460d7bfSJunchao Zhang 
1501d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1502d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1503d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1504d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1505d460d7bfSJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1506d460d7bfSJunchao Zhang   }
1507d460d7bfSJunchao Zhang 
1508d460d7bfSJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
1509d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1510d460d7bfSJunchao Zhang 
1511d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1512d460d7bfSJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1513d460d7bfSJunchao Zhang   }
1514d460d7bfSJunchao Zhang 
1515d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1516d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1517d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1518d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1519d460d7bfSJunchao Zhang 
1520d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1521d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1522d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1523d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1524d460d7bfSJunchao Zhang   } else {
1525d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1526d460d7bfSJunchao Zhang   }
1527d460d7bfSJunchao Zhang 
1528d460d7bfSJunchao Zhang   // Solve Ut Y = X
1529d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1530d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1531d460d7bfSJunchao Zhang 
1532d460d7bfSJunchao Zhang   // Solve Lt X = Y
1533d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1534d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1535d460d7bfSJunchao Zhang   } else {
1536d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1537d460d7bfSJunchao Zhang   }
1538d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1539d460d7bfSJunchao Zhang 
1540d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1541d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1542d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1543d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1544d460d7bfSJunchao Zhang   }
1545d460d7bfSJunchao Zhang 
1546d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1547d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1548d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1549d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1550d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1551d460d7bfSJunchao Zhang }
1552d460d7bfSJunchao Zhang #else
1553a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1554d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1555d71ae5a4SJacob Faibussowitsch {
1556c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1557465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1558465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1559465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1560465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1561bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1562aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1563aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1564aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1565bda325fcSPaul Mullowney 
1566bda325fcSPaul Mullowney   PetscFunctionBegin;
1567aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1568aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15699566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1570aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1571aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1572bda325fcSPaul Mullowney   }
1573bda325fcSPaul Mullowney 
1574bda325fcSPaul Mullowney   /* Get the GPU pointers */
15759566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15769566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1577c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1578c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1579bda325fcSPaul Mullowney 
15809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1581aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15829371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1583aa372e3fSPaul Mullowney 
1584aa372e3fSPaul Mullowney   /* First, solve U */
15859f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15869f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1587aa372e3fSPaul Mullowney 
1588aa372e3fSPaul Mullowney   /* Then, solve L */
15899f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15909f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1591aa372e3fSPaul Mullowney 
1592aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15939371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1594aa372e3fSPaul Mullowney 
1595aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1596a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1597bda325fcSPaul Mullowney 
1598bda325fcSPaul Mullowney   /* restore */
15999566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16009566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1604bda325fcSPaul Mullowney }
1605bda325fcSPaul Mullowney 
1606d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1607d71ae5a4SJacob Faibussowitsch {
1608465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1609465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1610bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1611aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1612aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1613aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1614bda325fcSPaul Mullowney 
1615bda325fcSPaul Mullowney   PetscFunctionBegin;
1616aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1617aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
16189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1619aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1620aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1621bda325fcSPaul Mullowney   }
1622bda325fcSPaul Mullowney 
1623bda325fcSPaul Mullowney   /* Get the GPU pointers */
16249566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16259566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1626bda325fcSPaul Mullowney 
16279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1628aa372e3fSPaul Mullowney   /* First, solve U */
16299f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16309f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1631aa372e3fSPaul Mullowney 
1632aa372e3fSPaul Mullowney   /* Then, solve L */
16339f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16349f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1635bda325fcSPaul Mullowney 
1636bda325fcSPaul Mullowney   /* restore */
16379566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16389566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16399566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1642bda325fcSPaul Mullowney }
1643bda325fcSPaul Mullowney 
1644d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1645d71ae5a4SJacob Faibussowitsch {
1646465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1647465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1648465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1649465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16509ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1651aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1652aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1653aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16549ae82921SPaul Mullowney 
16559ae82921SPaul Mullowney   PetscFunctionBegin;
1656e057df02SPaul Mullowney   /* Get the GPU pointers */
16579566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16589566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1659c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1660c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16619ae82921SPaul Mullowney 
16629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1663aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
16649371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1665aa372e3fSPaul Mullowney 
1666aa372e3fSPaul Mullowney   /* Next, solve L */
16679f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16689f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1669aa372e3fSPaul Mullowney 
1670aa372e3fSPaul Mullowney   /* Then, solve U */
16719f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16729f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1673d49cd2b7SBarry Smith 
16744e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16759371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16769ae82921SPaul Mullowney 
16779566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16789566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16829ae82921SPaul Mullowney }
16839ae82921SPaul Mullowney 
1684d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1685d71ae5a4SJacob Faibussowitsch {
1686465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1687465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16889ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1689aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1690aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1691aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16929ae82921SPaul Mullowney 
16939ae82921SPaul Mullowney   PetscFunctionBegin;
1694e057df02SPaul Mullowney   /* Get the GPU pointers */
16959566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16969566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16979ae82921SPaul Mullowney 
16989566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1699aa372e3fSPaul Mullowney   /* First, solve L */
17009f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
17019f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1702d49cd2b7SBarry Smith 
1703aa372e3fSPaul Mullowney   /* Next, solve U */
17049f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
17059f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
17069ae82921SPaul Mullowney 
17079566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
17089566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
17099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17109566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
17113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17129ae82921SPaul Mullowney }
1713d460d7bfSJunchao Zhang #endif
17149ae82921SPaul Mullowney 
1715b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
17168eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1717d71ae5a4SJacob Faibussowitsch {
1718da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1719da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1720da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1721da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1722da112707SJunchao Zhang   PetscInt                      m, nz;
1723da112707SJunchao Zhang   PetscBool                     flg;
1724da112707SJunchao Zhang 
1725da112707SJunchao Zhang   PetscFunctionBegin;
1726da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1727da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1728da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1729da112707SJunchao Zhang   }
1730da112707SJunchao Zhang 
1731da112707SJunchao Zhang   /* Copy A's value to fact */
1732da112707SJunchao Zhang   m  = fact->rmap->n;
1733da112707SJunchao Zhang   nz = aij->nz;
1734da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1735da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1736da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1737da112707SJunchao Zhang 
1738bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeBegin());
1739da112707SJunchao Zhang   /* Factorize fact inplace */
17409371c9d4SSatish Balay   if (m)
17419371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1742d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1743da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1744da112707SJunchao Zhang     int              numerical_zero;
1745da112707SJunchao Zhang     cusparseStatus_t status;
1746da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1747da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1748da112707SJunchao Zhang   }
1749da112707SJunchao Zhang 
1750204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1751204a0e31SJunchao Zhang   if (fs->updatedSpSVAnalysis) {
1752204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1753204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1754204a0e31SJunchao Zhang   } else
1755204a0e31SJunchao Zhang   #endif
1756204a0e31SJunchao Zhang   {
175712ba2bc6SJunchao Zhang     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
175812ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
175912ba2bc6SJunchao Zhang     */
17609371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1761da112707SJunchao Zhang 
17629371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1763da112707SJunchao Zhang 
1764204a0e31SJunchao Zhang     fs->updatedSpSVAnalysis = PETSC_TRUE;
176512ba2bc6SJunchao Zhang     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
176612ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1767204a0e31SJunchao Zhang   }
176812ba2bc6SJunchao Zhang 
1769da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1770d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1771d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1772da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1773da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1774bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeEnd());
1775da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1777da112707SJunchao Zhang }
1778da112707SJunchao Zhang 
17798eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1780d71ae5a4SJacob Faibussowitsch {
1781da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1782da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1783da112707SJunchao Zhang   PetscInt                      m, nz;
1784da112707SJunchao Zhang 
1785da112707SJunchao Zhang   PetscFunctionBegin;
1786da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1787da112707SJunchao Zhang     PetscInt  i;
1788da112707SJunchao Zhang     PetscBool flg, missing;
1789da112707SJunchao Zhang 
1790da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1791da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1792da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1793da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1794da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1795da112707SJunchao Zhang   }
1796da112707SJunchao Zhang 
1797da112707SJunchao Zhang   /* Free the old stale stuff */
1798da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1799da112707SJunchao Zhang 
1800da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1801da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1802da112707SJunchao Zhang    */
1803da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1804da112707SJunchao Zhang 
1805da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1806da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1807da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1808da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1809da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1810da112707SJunchao Zhang 
1811da112707SJunchao Zhang   aij->row = NULL;
1812da112707SJunchao Zhang   aij->col = NULL;
1813da112707SJunchao Zhang 
1814da112707SJunchao Zhang   /* ====================================================================== */
1815da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1816da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1817da112707SJunchao Zhang   /* ====================================================================== */
1818da112707SJunchao Zhang   const int *Ai, *Aj;
1819da112707SJunchao Zhang 
1820da112707SJunchao Zhang   m  = fact->rmap->n;
1821da112707SJunchao Zhang   nz = aij->nz;
1822da112707SJunchao Zhang 
1823f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1824f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1825f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1826d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1827d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1828d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1829da112707SJunchao Zhang 
1830da112707SJunchao Zhang   /* ====================================================================== */
1831da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1832da112707SJunchao Zhang   /* ====================================================================== */
1833da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1834da112707SJunchao Zhang   cusparseDiagType_t diagType;
1835da112707SJunchao Zhang 
1836da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1837da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1838da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1839da112707SJunchao Zhang 
1840da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1841da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1842da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1843da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1844da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1845da112707SJunchao Zhang   */
1846da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1847da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1848d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18499371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18509371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1851da112707SJunchao Zhang 
1852da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1853da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1854d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18559371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18569371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1857da112707SJunchao Zhang 
1858da112707SJunchao Zhang   /* ========================================================================= */
1859da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1860da112707SJunchao Zhang   /* ========================================================================= */
1861da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18629371c9d4SSatish Balay   if (m)
18639371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1864d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1865da112707SJunchao Zhang 
1866da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1867da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1868da112707SJunchao Zhang 
1869da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1870da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1871da112707SJunchao Zhang 
1872da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18739371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1874da112707SJunchao Zhang 
1875da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18769371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1877da112707SJunchao Zhang 
1878da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
187912ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
188012ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
188112ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1882da112707SJunchao Zhang    */
188312ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
188412ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
188512ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1886da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
188712ba2bc6SJunchao Zhang   } else {
188812ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
188912ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1890da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
189112ba2bc6SJunchao Zhang   }
1892da112707SJunchao Zhang 
1893da112707SJunchao Zhang   /* ========================================================================== */
1894da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1895da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1896da112707SJunchao Zhang   /* ========================================================================== */
1897da112707SJunchao Zhang   int              structural_zero;
1898da112707SJunchao Zhang   cusparseStatus_t status;
1899da112707SJunchao Zhang 
1900da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
19019371c9d4SSatish Balay   if (m)
19029371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1903d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1904da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1905da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1906da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1907da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1908da112707SJunchao Zhang   }
1909da112707SJunchao Zhang 
1910da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
19110dd8c0acSJunchao Zhang   {
1912da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
19130dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1914da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1915da112707SJunchao Zhang 
1916da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1917da112707SJunchao Zhang     Ai    = Aseq->i;
1918da112707SJunchao Zhang     Adiag = Aseq->diag;
1919da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1920da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1921da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1922da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1923da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1924da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1925da112707SJunchao Zhang         */
1926da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1927da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1928da112707SJunchao Zhang       }
1929da112707SJunchao Zhang     }
1930da112707SJunchao Zhang     fs->numericFactFlops = flops;
19310dd8c0acSJunchao Zhang   }
1932da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1934da112707SJunchao Zhang }
1935da112707SJunchao Zhang 
/* Solve A x = b using the ICC(0) factorization held in fact (computed by
   MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0()), via two triangular solves on the GPU:
   first L y = b, then L^T x = y. The factor is symmetric, so this routine is
   installed for both solve and solvetranspose.

   Input Parameters:
.  fact - the factored matrix, with Mat_SeqAIJCUSPARSETriFactors in fact->spptr
.  b    - the right-hand side vector

   Output Parameter:
.  x    - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  /* The dense-vector descriptors are reused across solves; repoint X at b's data and Y at the
     internal work array fs->Y before each SpSV call */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  /* Reuse descriptor X to point at the output array; the transpose solve uses the same matrix
     descriptor spMatDescr_L but the pre-analyzed spsvDescr_Lt */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves cost about 2*nz multiply-adds; the diagonal would otherwise be counted twice */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1966da112707SJunchao Zhang 
/* Numeric ICC(0) factorization on the GPU using cusparse csric02.

   fact shares A's sparsity pattern (established in MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0()),
   so only A's values need to be copied into fact before the in-place factorization. Afterwards
   the SpSV descriptors for the L and L^T solves are refreshed: on CUDA >= 12.1.1 a cheap
   cusparseSpSV_updateMatrix() suffices once the full analysis has been done, otherwise the full
   SpSV analysis is (re)run. On success the solve function pointers are installed.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's values are current on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  /* cusparseXcsric02 errors out on empty matrices, hence the m != 0 guard */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot query is blocking (synchronizes the device), so it only runs in debug builds */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* On CUDA >= 12.1.1, once an SpSV analysis has been done we can just push the new values into
     the solve descriptors instead of re-running the full analysis */
  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* the factor is symmetric, so the same solve works */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count estimated in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2028da112707SJunchao Zhang 
/* Symbolic phase of the GPU ICC(0) factorization.

   ICC(0) introduces no fill, so fact simply adopts A's nonzero pattern. This routine
   (1) duplicates A's metadata into fact, (2) allocates fact's CSR arrays on the device
   and copies A's row pointers and column indices into them, (3) creates the cusparse
   descriptors for the in-place csric02 factorization (matDescr_M) and the L / L^T
   triangular solves (spMatDescr_L, spsvDescr_L, spsvDescr_Lt), (4) queries and allocates
   the work buffers, (5) runs the pattern-only analysis passes, and (6) estimates the flop
   count of a numeric factorization for logging. It finishes by installing the numeric
   factorization routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) has no fill, so the needed ratio is exactly 1 */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  /* cusparseXcsric02_bufferSize errors out on empty matrices, hence the m != 0 guard */
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* logged once per numeric factorization */
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2170da112707SJunchao Zhang #endif
2171da112707SJunchao Zhang 
/* Numeric LU factorization for MATSEQAIJCUSPARSE matrices.

   The factorization itself is performed on the host by MatLUFactorNumeric_SeqAIJ();
   unless the user requested CPU solves, the device solve routines are then installed
   and the triangular factors are staged on the GPU.

   Input Parameters:
.  B    - the factor matrix (symbolic phase already done)
.  A    - the matrix being factored
.  info - factorization options
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // Note: use_cpu_solve lives in Mat_SeqAIJCUSPARSE (A's spptr); B, being a factored
  // matrix, carries a Mat_SeqAIJCUSPARSETriFactors in its spptr instead.
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host factorization needs A's values on the host */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Select the MatSolve flavor: with identity row and column permutations the
       natural-ordering variants can be used */
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    PetscBool   rowIsIdentity, colIsIdentity;

    PetscCall(ISIdentity(bseq->row, &rowIsIdentity));
    PetscCall(ISIdentity(bseq->col, &colIsIdentity));
    if (rowIsIdentity && colIsIdentity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* Stage the triangular factors on the GPU for the device solves */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2210d460d7bfSJunchao Zhang 
/* Symbolic LU factorization: drop any stale device-side factor data from a previous
   factorization, run the host symbolic phase, and install the CUSPARSE numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2221d460d7bfSJunchao Zhang 
/* Symbolic ILU factorization dispatcher.

   On CUDA >= 11.4, ILU(0) with identity row/column permutations (and factorization not forced
   onto the host) takes the specialized device path MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0();
   every other case falls back to the host symbolic phase with the CUSPARSE numeric routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool rowPermIsIdentity = PETSC_FALSE, colPermIsIdentity = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &rowPermIsIdentity));
    PetscCall(ISIdentity(iscol, &colPermIsIdentity));
  }
  if (!info->levels && rowPermIsIdentity && colPermIsIdentity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors)); /* drop stale device-side factor data */
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2244da112707SJunchao Zhang 
/* Symbolic ICC factorization dispatcher.

   On CUDA >= 11.4, ICC(0) with an identity permutation (and factorization not forced onto
   the host) takes the specialized device path MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0();
   otherwise fall back to the host symbolic phase with the CUSPARSE numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool permIsIdentity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &permIsIdentity));
  if (!info->levels && permIsIdentity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors)); /* drop stale device-side factor data */
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2264da112707SJunchao Zhang 
/* Symbolic Cholesky factorization: drop any stale device-side factor data from a previous
   factorization, run the host symbolic phase, and install the CUSPARSE numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2275da112707SJunchao Zhang 
/* Query function composed on factored matrices: reports MATSOLVERCUSPARSE as the solver package */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2282841d4cb1SJunchao Zhang 
2283841d4cb1SJunchao Zhang /*MC
2284841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
228511a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2287841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
228811a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2289841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2290841d4cb1SJunchao Zhang 
2291841d4cb1SJunchao Zhang   Level: beginner
2292841d4cb1SJunchao Zhang 
22931cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22942ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2295841d4cb1SJunchao Zhang M*/
2296841d4cb1SJunchao Zhang 
/*
  Factory routine for the MATSOLVERCUSPARSE package: creates the (square,
  sequential) factor matrix B for A, wires the symbolic-factorization function
  pointers according to the requested factor type, and records the preferred
  ordering for each factor kind.  If A is bound to the CPU (and binding
  propagates), the plain SeqAIJ symbolic routines are installed instead of the
  CUSPARSE ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n; // factorization requires a square matrix, so rows == cols

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    /* choose device or host symbolic routines depending on CPU binding */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* nested dissection for full LU; natural ordering for the incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* the factor matrix receives its sparsity pattern later, during the symbolic phase */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2337841d4cb1SJunchao Zhang 
/*
  Copies the matrix values from the GPU back to the host CSR arrays when the
  up-to-date copy lives only on the device (PETSC_OFFLOAD_GPU).  For an
  unfactored matrix the values come from the CSR stored in Mat_SeqAIJCUSPARSE;
  for a factored matrix (CUDA >= 11.4 only) they come from the factor's csrVal
  buffer.  On success the offload mask becomes PETSC_OFFLOAD_BOTH.
  Only values are copied; the sparsity pattern (i/j) is assumed unchanged.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* for factored matrices spptr holds a Mat_SeqAIJCUSPARSETriFactors instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23677e8381f9SStefano Zampini 
/* Returns the host value array for read/write access, syncing device values to the host first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* make sure the host copy of the values is current before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237567a45760SJunchao Zhang 
/* Ends read/write access to the host value array; the host copy is now the authoritative one */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* values may have been modified on the host, so invalidate the device copy */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
238367a45760SJunchao Zhang 
/* Returns the host value array for read-only access, syncing device values to the host first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* bring device values to the host; read-only access leaves the offload mask untouched */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
239167a45760SJunchao Zhang 
/* Ends read-only access to the host value array; nothing was modified, so the offload mask is left alone */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
239867a45760SJunchao Zhang 
/* Returns the host value array for write-only access; no device-to-host copy is needed
   since the existing values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
240567a45760SJunchao Zhang 
/* Ends write-only access to the host value array; the freshly written host copy now rules */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* host values were (re)written, so the device copy is stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24137e8381f9SStefano Zampini 
/*
  Returns device pointers to the CSR representation (row offsets i, column
  indices j, values a) of an unfactored MATSEQAIJCUSPARSE matrix, copying the
  matrix to the GPU first if needed.  Any of i/j/a/mtype may be NULL to skip
  that output.  The reported memory type is PETSC_MEMTYPE_CUDA.

  The device CSR uses 32-bit indices (THRUSTINTARRAY32), so the i/j outputs
  are unavailable in a 64-bit-PetscInt build.

  Fix: the 64-bit-indices error messages read "does not supported"; corrected
  to "does not support".
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24447ee59b9bSJunchao Zhang 
/*
  Copies (or rebuilds) the device representation of a MATSEQAIJCUSPARSE matrix
  from the host CSR data when the host copy is newer (PETSC_OFFLOAD_UNALLOCATED
  or PETSC_OFFLOAD_CPU).

  Two paths:
  - If the nonzero pattern is unchanged (same nonzerostate) and the format is
    plain CSR, only the values are re-uploaded and the cached transpose is
    invalidated (values only).
  - Otherwise the whole device structure is destroyed and rebuilt: row
    offsets, column indices, and values are copied into 32-bit-index thrust
    arrays, the cusparse descriptors are created, and (if compressed rows are
    in use) the compressed-row index array and work vector are allocated.

  On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host had
  no value array (a->a == NULL), in which case only the pattern was uploaded
  and the mask is left as is.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set PETSC_FALSE when only the pattern (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached explicit transpose is stale (PETSC_FALSE: keep its structure) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* nonzero pattern changed (or non-CSR format): tear down and rebuild the whole device structure */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed rows only the nonempty rows are represented on the device */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload the pattern only and do not mark the values as synced */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident alpha/beta constants, needed because the handle uses CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary CSR, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25949ae82921SPaul Mullowney 
/* Thrust functor for zipped iterators: accumulates element 0 into element 1 (dst += src) */
struct VecCUDAPlusEquals {
  template <typename Pair>
  __host__ __device__ void operator()(Pair p)
  {
    thrust::get<1>(p) = thrust::get<0>(p) + thrust::get<1>(p);
  }
};
2602aa372e3fSPaul Mullowney 
/* Thrust functor for zipped iterators: copies element 0 into element 1 (dst = src) */
struct VecCUDAEquals {
  template <typename Pair>
  __host__ __device__ void operator()(Pair p)
  {
    thrust::get<1>(p) = thrust::get<0>(p);
  }
};
26107e8381f9SStefano Zampini 
/* Thrust functor for zipped iterators: copies element 1 into element 0 (reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename Pair>
  __host__ __device__ void operator()(Pair p)
  {
    thrust::get<0>(p) = thrust::get<1>(p);
  }
};
2618e6e9a74fSStefano Zampini 
/*
  Per-product state attached to C->product->data for sparse matrix-matrix
  products involving a MATSEQAIJCUSPARSE operand; freed by
  MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably records whether C started as a (CPU) dense matrix so it can be converted back — confirm against the symbolic routine */
  PetscScalar   *Bt;       /* device buffer (cudaMalloc'd; freed with cudaFree in MatDestroy_MatMatCusparse) */
  Mat            X;        /* intermediate product matrix, used for MATPRODUCT_RARt/PtAP (see MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* extra SpGEMM work buffers used by the CUDA >= 11.4 SpGEMM_reuse path */
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2643ccdfe979SStefano Zampini 
/*
  Destructor for the MatMatCusparse product data: releases the device buffers,
  cusparse descriptors, the intermediate matrix X, and finally the struct
  itself.  Descriptors and buffers are destroyed only if non-NULL, so a
  partially initialized struct is handled safely.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2667ccdfe979SStefano Zampini 
26684742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2669ccdfe979SStefano Zampini 
2670d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2671d71ae5a4SJacob Faibussowitsch {
2672ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2673ccdfe979SStefano Zampini   Mat                           A, B;
2674afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2675ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2676ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2677ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2678ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2679ccdfe979SStefano Zampini   const PetscScalar            *barray;
2680ccdfe979SStefano Zampini   PetscScalar                  *carray;
2681ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2682ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2683ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2684ccdfe979SStefano Zampini 
2685ccdfe979SStefano Zampini   PetscFunctionBegin;
2686ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
268728b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2688ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2689ccdfe979SStefano Zampini   A      = product->A;
2690ccdfe979SStefano Zampini   B      = product->B;
26919566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
269228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2693ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2694ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
269528b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2697ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2698ccdfe979SStefano Zampini   switch (product->type) {
2699ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2700ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2701ccdfe979SStefano Zampini     mat = cusp->mat;
2702ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2703ccdfe979SStefano Zampini     m   = A->rmap->n;
2704ccdfe979SStefano Zampini     n   = B->cmap->n;
2705ccdfe979SStefano Zampini     break;
2706ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
27071a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2708e6e9a74fSStefano Zampini       mat = cusp->mat;
2709e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2710e6e9a74fSStefano Zampini     } else {
27119566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2712ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2713ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2714e6e9a74fSStefano Zampini     }
2715ccdfe979SStefano Zampini     m = A->cmap->n;
2716ccdfe979SStefano Zampini     n = B->cmap->n;
2717ccdfe979SStefano Zampini     break;
2718ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2719ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2720ccdfe979SStefano Zampini     mat = cusp->mat;
2721ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2722ccdfe979SStefano Zampini     m   = A->rmap->n;
2723ccdfe979SStefano Zampini     n   = B->rmap->n;
2724ccdfe979SStefano Zampini     break;
2725d71ae5a4SJacob Faibussowitsch   default:
2726d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2727ccdfe979SStefano Zampini   }
272828b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2729ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2730ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27319566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27329566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2733cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2734afb2bd1cSJunchao Zhang 
27359566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2736c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2737cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27389566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2739c8378d12SStefano Zampini   } else {
2740cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27419566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2742c8378d12SStefano Zampini   }
2743c8378d12SStefano Zampini 
27449566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2745afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2746afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2747fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2748fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2749fe5544b9SJunchao Zhang   #else
2750fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2751fe5544b9SJunchao Zhang   #endif
2752fe5544b9SJunchao Zhang 
2753a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2754afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2755fcdce8c4SStefano Zampini     size_t mmBufferSize;
27569371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27579371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27589371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27599371c9d4SSatish Balay     }
2760afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27619566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2762afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2763afb2bd1cSJunchao Zhang     }
2764c8378d12SStefano Zampini 
27659371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27669371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27679371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27689371c9d4SSatish Balay     }
2769afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27709566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2771afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2772afb2bd1cSJunchao Zhang     }
2773afb2bd1cSJunchao Zhang 
2774fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2775fe5544b9SJunchao Zhang     if (matADescr) {
277617f5f06fSJunchao Zhang       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2777fe5544b9SJunchao Zhang       matADescr = NULL;
2778fe5544b9SJunchao Zhang     }
2779fe5544b9SJunchao Zhang   #endif
2780fe5544b9SJunchao Zhang 
2781fe5544b9SJunchao Zhang     if (!matADescr) {
2782fe5544b9SJunchao Zhang       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27839371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27849371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2785afb2bd1cSJunchao Zhang     }
2786fe5544b9SJunchao Zhang 
2787fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2788fe5544b9SJunchao Zhang 
2789fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27909566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27919566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2792fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2793fcdce8c4SStefano Zampini     }
2794fe5544b9SJunchao Zhang 
2795f0b74427SPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2796fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2797fe5544b9SJunchao Zhang   #endif
2798fe5544b9SJunchao Zhang 
2799afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2800afb2bd1cSJunchao Zhang   } else {
2801afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2802fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
28039566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
28049566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2805afb2bd1cSJunchao Zhang   }
2806afb2bd1cSJunchao Zhang 
2807afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2808fe5544b9SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2809afb2bd1cSJunchao Zhang #else
2810afb2bd1cSJunchao Zhang   PetscInt k;
2811afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2812ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2813ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2814ccdfe979SStefano Zampini     cublasStatus_t cerr;
2815ccdfe979SStefano Zampini 
28169566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
28179371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
28189371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2819ccdfe979SStefano Zampini     blda = B->cmap->n;
2820afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2821afb2bd1cSJunchao Zhang   } else {
2822afb2bd1cSJunchao Zhang     k = B->rmap->n;
2823ccdfe979SStefano Zampini   }
2824ccdfe979SStefano Zampini 
2825afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
28269371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
28279371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2828afb2bd1cSJunchao Zhang #endif
28299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
28309566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2831cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2832ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2833cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28344742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2835ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2836cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28374742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2838ccdfe979SStefano Zampini   } else {
2839cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2840ccdfe979SStefano Zampini   }
284148a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
284248a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
28433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2844ccdfe979SStefano Zampini }
2845ccdfe979SStefano Zampini 
2846d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2847d71ae5a4SJacob Faibussowitsch {
2848ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2849ccdfe979SStefano Zampini   Mat                 A, B;
2850ccdfe979SStefano Zampini   PetscInt            m, n;
2851ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2852ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2853ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2854ccdfe979SStefano Zampini 
2855ccdfe979SStefano Zampini   PetscFunctionBegin;
2856ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
285728b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2858ccdfe979SStefano Zampini   A = product->A;
2859ccdfe979SStefano Zampini   B = product->B;
28609566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
286128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2862ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
286308401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2864ccdfe979SStefano Zampini   switch (product->type) {
2865ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2866ccdfe979SStefano Zampini     m = A->rmap->n;
2867ccdfe979SStefano Zampini     n = B->cmap->n;
28680e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2869ccdfe979SStefano Zampini     break;
2870ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2871ccdfe979SStefano Zampini     m = A->cmap->n;
2872ccdfe979SStefano Zampini     n = B->cmap->n;
28730e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
28740e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2875ccdfe979SStefano Zampini     break;
2876ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2877ccdfe979SStefano Zampini     m = A->rmap->n;
2878ccdfe979SStefano Zampini     n = B->rmap->n;
28790e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
28800e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2881ccdfe979SStefano Zampini     break;
2882ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2883ccdfe979SStefano Zampini     m = B->cmap->n;
2884ccdfe979SStefano Zampini     n = B->cmap->n;
28850e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
28860e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2887ccdfe979SStefano Zampini     break;
2888ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2889ccdfe979SStefano Zampini     m = B->rmap->n;
2890ccdfe979SStefano Zampini     n = B->rmap->n;
28910e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
28920e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2893ccdfe979SStefano Zampini     break;
2894d71ae5a4SJacob Faibussowitsch   default:
2895d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2896ccdfe979SStefano Zampini   }
28979566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2898ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
28999566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
29009566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2901ccdfe979SStefano Zampini 
2902ccdfe979SStefano Zampini   /* product data */
29039566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2904ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2905afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2906afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
290748a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2908afb2bd1cSJunchao Zhang #endif
2909ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2910ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
29119566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
29129566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2913ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
29149566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2915ccdfe979SStefano Zampini     } else {
29169566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2917ccdfe979SStefano Zampini     }
2918ccdfe979SStefano Zampini   }
2919ccdfe979SStefano Zampini   C->product->data    = mmdata;
2920ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2921ccdfe979SStefano Zampini 
2922ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
29233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2924ccdfe979SStefano Zampini }
2925ccdfe979SStefano Zampini 
2926d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2927d71ae5a4SJacob Faibussowitsch {
2928ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2929fcdce8c4SStefano Zampini   Mat                           A, B;
2930fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2931fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2932fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2933fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2934fcdce8c4SStefano Zampini   PetscBool                     flg;
2935fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2936fcdce8c4SStefano Zampini   MatProductType                ptype;
2937fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2938fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2939fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2940fcdce8c4SStefano Zampini #endif
2941b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2942ccdfe979SStefano Zampini 
2943ccdfe979SStefano Zampini   PetscFunctionBegin;
2944ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
294528b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
29469566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
294728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2948fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2949fcdce8c4SStefano Zampini   A      = product->A;
2950fcdce8c4SStefano Zampini   B      = product->B;
2951fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2952fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2953fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
295408401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2955fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
295628b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2957fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
295828b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2959fcdce8c4SStefano Zampini     goto finalize;
2960fcdce8c4SStefano Zampini   }
2961fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29629566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
296328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29649566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
296528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
296628b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
296728b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2968fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2969fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2970fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
297108401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
297208401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
297308401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
29749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2976fcdce8c4SStefano Zampini 
2977fcdce8c4SStefano Zampini   ptype = product->type;
2978b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2979fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
298028b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2981fa046f9fSJunchao Zhang   }
2982b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2983fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
298428b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2985fa046f9fSJunchao Zhang   }
2986fcdce8c4SStefano Zampini   switch (ptype) {
2987fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2988fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2989fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2990fcdce8c4SStefano Zampini     break;
2991fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2992fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2993fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2994fcdce8c4SStefano Zampini     break;
2995fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2996fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2997fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2998fcdce8c4SStefano Zampini     break;
2999d71ae5a4SJacob Faibussowitsch   default:
3000d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3001fcdce8c4SStefano Zampini   }
3002fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
300328b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
300428b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
300528b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3006fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3007fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3008fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
300928b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
301028b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
301128b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
30129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3013fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3014fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
30159566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3016b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
30179371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30189371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3019b4285af6SJunchao Zhang   #else
30209371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
30219371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
30229371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30239371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3024b4285af6SJunchao Zhang   #endif
3025fcdce8c4SStefano Zampini #else
30269371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30279371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30289371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3029fcdce8c4SStefano Zampini #endif
30309566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30319566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
30329566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3033fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
3034fcdce8c4SStefano Zampini finalize:
3035fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
30369566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
30379566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
30389566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3039fcdce8c4SStefano Zampini   c->reallocs = 0;
3040fcdce8c4SStefano Zampini   C->info.mallocs += 0;
3041fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
3042fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
3043fcdce8c4SStefano Zampini   C->num_ass++;
30443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3045ccdfe979SStefano Zampini }
3046fcdce8c4SStefano Zampini 
3047d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3048d71ae5a4SJacob Faibussowitsch {
3049fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3050fcdce8c4SStefano Zampini   Mat                           A, B;
3051fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3052fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3053fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3054fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3055fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3056fcdce8c4SStefano Zampini   PetscBool                     flg;
3057fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3058fcdce8c4SStefano Zampini   MatProductType                ptype;
3059fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3060fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3061fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3062fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3063fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3064fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3065fcdce8c4SStefano Zampini #else
3066fcdce8c4SStefano Zampini   int cnz;
3067fcdce8c4SStefano Zampini #endif
3068b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3069fcdce8c4SStefano Zampini 
3070fcdce8c4SStefano Zampini   PetscFunctionBegin;
3071fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
307228b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3073fcdce8c4SStefano Zampini   A = product->A;
3074fcdce8c4SStefano Zampini   B = product->B;
30759566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
307628b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30779566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
307828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3079fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3080fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3081fcdce8c4SStefano Zampini   /* product data */
30829566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3083fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3084fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3085fcdce8c4SStefano Zampini 
30869566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3088d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3089d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
309008401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
309108401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3092d60bce21SJunchao Zhang 
3093fcdce8c4SStefano Zampini   ptype = product->type;
3094b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3095fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3096fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3097fa046f9fSJunchao Zhang   }
3098b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3099fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3100fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3101fa046f9fSJunchao Zhang   }
3102fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3103fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3104fcdce8c4SStefano Zampini   switch (ptype) {
3105fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3106fcdce8c4SStefano Zampini     m    = A->rmap->n;
3107fcdce8c4SStefano Zampini     n    = B->cmap->n;
3108fcdce8c4SStefano Zampini     k    = A->cmap->n;
3109fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3110fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3111fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3112fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3113fcdce8c4SStefano Zampini     break;
3114fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3115fcdce8c4SStefano Zampini     m = A->cmap->n;
3116fcdce8c4SStefano Zampini     n = B->cmap->n;
3117fcdce8c4SStefano Zampini     k = A->rmap->n;
31189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3119fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3120fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3121fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3122fcdce8c4SStefano Zampini     break;
3123fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3124fcdce8c4SStefano Zampini     m = A->rmap->n;
3125fcdce8c4SStefano Zampini     n = B->rmap->n;
3126fcdce8c4SStefano Zampini     k = A->cmap->n;
31279566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3128fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3129fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3130fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3131fcdce8c4SStefano Zampini     break;
3132d71ae5a4SJacob Faibussowitsch   default:
3133d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3134fcdce8c4SStefano Zampini   }
3135fcdce8c4SStefano Zampini 
3136fcdce8c4SStefano Zampini   /* create cusparse matrix */
31379566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
31389566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3139fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3140fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3141fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3142fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3143fcdce8c4SStefano Zampini 
3144fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3145fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3146fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
31479566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
31489566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3149fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3150fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3151fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3152fcdce8c4SStefano Zampini   } else {
3153fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3154fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3155fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3156fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3157fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3158fcdce8c4SStefano Zampini   }
3159fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3160fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3161fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3162fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3163fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3164fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31659566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31669566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31679566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3168f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3169f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3170f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31719566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31729566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31739566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3174fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3175d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3176fcdce8c4SStefano Zampini     c->nz                = 0;
3177fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3178fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3179fcdce8c4SStefano Zampini     goto finalizesym;
3180fcdce8c4SStefano Zampini   }
3181fcdce8c4SStefano Zampini 
318228b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
318328b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3184fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3185fcdce8c4SStefano Zampini   if (!biscompressed) {
3186fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3187fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3188fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3189fcdce8c4SStefano Zampini #endif
3190fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3191fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3192fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3193fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3194fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3195fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3196fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3197fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3198fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3199fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3200fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
32019566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3202fcdce8c4SStefano Zampini     }
3203fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3204fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3205fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3206fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
32079371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32089371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3209fcdce8c4SStefano Zampini     }
3210fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3211fcdce8c4SStefano Zampini #endif
3212fcdce8c4SStefano Zampini   }
321328b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
321428b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3215fcdce8c4SStefano Zampini   /* precompute flops count */
3216fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3217fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3218fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3219fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3220fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3221fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3222fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3223fcdce8c4SStefano Zampini       }
3224fcdce8c4SStefano Zampini     }
3225fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3226fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3227fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3228fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3229fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3230fcdce8c4SStefano Zampini     }
3231fcdce8c4SStefano Zampini   } else { /* TODO */
3232fcdce8c4SStefano Zampini     flops = 0.;
3233fcdce8c4SStefano Zampini   }
3234fcdce8c4SStefano Zampini 
3235fcdce8c4SStefano Zampini   mmdata->flops = flops;
32369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3237b4285af6SJunchao Zhang 
3238fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
32399566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
32401ffab3bdSJunchao Zhang   // cuda-12.2 requires non-null csrRowOffsets
32411ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32429371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3244b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3245b4285af6SJunchao Zhang   {
3246b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3247b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3248b4285af6SJunchao Zhang   */
3249b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3250b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3251b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3252b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3253b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3254b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3255b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3256b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3257b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3258b4285af6SJunchao Zhang 
3259b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32609371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32619371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32629566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3263b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32649371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32659371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3266b4285af6SJunchao Zhang 
32679371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32689371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32699566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32709566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32729371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32739371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32749566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32759566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3276b4285af6SJunchao Zhang 
3277b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32789566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3279b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3280b4285af6SJunchao Zhang     /* allocate matrix C */
32819371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32829371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32839371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32849371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3285b4285af6SJunchao Zhang     /* update matC with the new pointers */
32869371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32879371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3288b4285af6SJunchao Zhang 
32899371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32909371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32919566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32929371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32939371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32949566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32959371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32969371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32979566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3298b4285af6SJunchao Zhang   }
3299ae37ee31SJunchao Zhang   #else
3300b4285af6SJunchao Zhang   size_t bufSize2;
3301fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
33029371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
33039371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33049566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3305fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
33069371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
33079371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3308fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
33099371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
33109371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3311fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3312fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3313fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3314fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3315fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
33169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3317fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
33189371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
33199371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3320fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
33219566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3322fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
33239371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
33249371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3325fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33269566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3327fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33289566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
33299371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
33309371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33319371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
33329371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3333ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3334fcdce8c4SStefano Zampini #else
33359566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
33369371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33379371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
33389371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3339fcdce8c4SStefano Zampini   c->nz                = cnz;
3340fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33419566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3342fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33439566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3344fcdce8c4SStefano Zampini 
33459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3346fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3347fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3348fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
33499371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33509371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
33519371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3352fcdce8c4SStefano Zampini #endif
33539566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33549566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3355fcdce8c4SStefano Zampini finalizesym:
3356fcdce8c4SStefano Zampini   c->free_a = PETSC_TRUE;
33579f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
33589f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3359fcdce8c4SStefano Zampini   c->free_ij = PETSC_TRUE;
33607de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3361fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3362fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3363fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3364fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3365fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3366fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33679566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33689566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3369fcdce8c4SStefano Zampini   } else {
3370fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3371fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33729566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3374fcdce8c4SStefano Zampini   }
3375fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3376fcdce8c4SStefano Zampini     PetscInt r = 0;
3377fcdce8c4SStefano Zampini     c->i[0]    = 0;
3378fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3379fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3380fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3381fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3382fcdce8c4SStefano Zampini     }
3383fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3384fcdce8c4SStefano Zampini   }
33859566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33869566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3388fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3389fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3390fcdce8c4SStefano Zampini   c->rmax          = 0;
3391fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3392fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3393fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3394fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3395fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3396fcdce8c4SStefano Zampini   }
33979566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
33989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3399fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3400fcdce8c4SStefano Zampini 
3401fcdce8c4SStefano Zampini   C->nonzerostate++;
34029566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
34039566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3404fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3405fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3406fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3407fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3408fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3409abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3410fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3411fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3412fcdce8c4SStefano Zampini   }
3413fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
34143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3415fcdce8c4SStefano Zampini }
3416fcdce8c4SStefano Zampini 
3417fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3418fcdce8c4SStefano Zampini 
3419fcdce8c4SStefano Zampini /* handles products where B is either sparse (MATSEQAIJCUSPARSE) or dense (MATSEQDENSE) */
3420d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3421d71ae5a4SJacob Faibussowitsch {
3422fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3423fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3424fcdce8c4SStefano Zampini 
3425fcdce8c4SStefano Zampini   PetscFunctionBegin;
3426fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
34279566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
342848a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3429fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3430fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
343148a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3432fcdce8c4SStefano Zampini   }
343365e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
343465e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
343565e4b4d4SStefano Zampini     switch (product->type) {
343665e4b4d4SStefano Zampini     case MATPRODUCT_AB:
343765e4b4d4SStefano Zampini       if (product->api_user) {
3438d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
34399566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3440d0609cedSBarry Smith         PetscOptionsEnd();
344165e4b4d4SStefano Zampini       } else {
3442d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
34439566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3444d0609cedSBarry Smith         PetscOptionsEnd();
344565e4b4d4SStefano Zampini       }
344665e4b4d4SStefano Zampini       break;
344765e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
344865e4b4d4SStefano Zampini       if (product->api_user) {
3449d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
34509566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3451d0609cedSBarry Smith         PetscOptionsEnd();
345265e4b4d4SStefano Zampini       } else {
3453d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
34549566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3455d0609cedSBarry Smith         PetscOptionsEnd();
345665e4b4d4SStefano Zampini       }
345765e4b4d4SStefano Zampini       break;
345865e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
345965e4b4d4SStefano Zampini       if (product->api_user) {
3460d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
34619566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3462d0609cedSBarry Smith         PetscOptionsEnd();
346365e4b4d4SStefano Zampini       } else {
3464d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
34659566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3466d0609cedSBarry Smith         PetscOptionsEnd();
346765e4b4d4SStefano Zampini       }
346865e4b4d4SStefano Zampini       break;
346965e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
347065e4b4d4SStefano Zampini       if (product->api_user) {
3471d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
34729566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3473d0609cedSBarry Smith         PetscOptionsEnd();
347465e4b4d4SStefano Zampini       } else {
3475d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
34769566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3477d0609cedSBarry Smith         PetscOptionsEnd();
347865e4b4d4SStefano Zampini       }
347965e4b4d4SStefano Zampini       break;
348065e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
348165e4b4d4SStefano Zampini       if (product->api_user) {
3482d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
34839566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3484d0609cedSBarry Smith         PetscOptionsEnd();
348565e4b4d4SStefano Zampini       } else {
3486d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
34879566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3488d0609cedSBarry Smith         PetscOptionsEnd();
348965e4b4d4SStefano Zampini       }
349065e4b4d4SStefano Zampini       break;
3491d71ae5a4SJacob Faibussowitsch     default:
3492d71ae5a4SJacob Faibussowitsch       break;
349365e4b4d4SStefano Zampini     }
349465e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
349565e4b4d4SStefano Zampini   }
349665e4b4d4SStefano Zampini   /* dispatch */
3497fcdce8c4SStefano Zampini   if (isdense) {
3498ccdfe979SStefano Zampini     switch (product->type) {
3499ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3500ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3501ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3502ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3503ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3504fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
35059566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3506fcdce8c4SStefano Zampini       } else {
3507fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3508fcdce8c4SStefano Zampini       }
3509fcdce8c4SStefano Zampini       break;
3510d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3511d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3512d71ae5a4SJacob Faibussowitsch       break;
3513d71ae5a4SJacob Faibussowitsch     default:
3514d71ae5a4SJacob Faibussowitsch       break;
3515ccdfe979SStefano Zampini     }
3516fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3517fcdce8c4SStefano Zampini     switch (product->type) {
3518fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3519fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3520d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3521d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3522d71ae5a4SJacob Faibussowitsch       break;
3523fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3524fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3525d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3526d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3527d71ae5a4SJacob Faibussowitsch       break;
3528d71ae5a4SJacob Faibussowitsch     default:
3529d71ae5a4SJacob Faibussowitsch       break;
3530fcdce8c4SStefano Zampini     }
3531fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
35329566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3533fcdce8c4SStefano Zampini   }
35343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3535ccdfe979SStefano Zampini }
3536ccdfe979SStefano Zampini 
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* yy = A*xx: no added vector (NULL), no transpose, no Hermitian conjugation */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3543e6e9a74fSStefano Zampini 
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* zz = A*xx + yy: no transpose, no Hermitian conjugation */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3550e6e9a74fSStefano Zampini 
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* yy = A^H*xx: transpose with conjugation (trans = herm = PETSC_TRUE) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3557e6e9a74fSStefano Zampini 
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* zz = A^H*xx + yy: transpose with conjugation (trans = herm = PETSC_TRUE) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
35649ae82921SPaul Mullowney 
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* yy = A^T*xx: transpose without conjugation (trans = PETSC_TRUE, herm = PETSC_FALSE) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3571ca45077fSPaul Mullowney 
/* Scatter-add kernel: y[idx[i]] += x[i] for i in [0,n).

   Used to accumulate the compressed (short) work vector x into the full-length vector y at the
   row positions idx. One thread per entry; the caller launches ceil(n/256) blocks of 256 threads.
   NOTE(review): assumes the entries of idx are distinct (they are the compressed-row indices),
   so no atomics are needed -- confirm if ever reused with a different index set. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* compute the global index in PetscInt: the previous 32-bit `int blockIdx.x * blockDim.x`
     product can overflow when n exceeds 2^31 with 64-bit PetscInt builds */
  PetscInt i = blockIdx.x * (PetscInt)blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3577a0e72f99SJunchao Zhang 
3578afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x, read off the CSR matrix (not the work vector, which may be longer) */
#endif

  PetscFunctionBegin;
  /* herm implies trans: op = ^H is only reachable through the transpose path */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* Empty matrix: op(A) x is identically zero, so the result is just yy (or zero when there is no yy) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Transpose path: either ask cuSPARSE to transpose on the fly (always for ^H, or when no
       explicit transpose is requested), or build/reuse an explicitly stored transpose */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  /* Everything in this scope may throw (thrust/cusparse C++ helpers); exceptions are converted to a PETSc error below */
  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* workVector[i] = x[cprowIndices[i]] via the VecCUDAEqualsReverse functor on zipped iterators */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      /* opA indexes fixed-size per-operation arrays (matDescr_SpMV, cuSpMV), hence the range guard */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        /* First call with this opA: create the dense-vector descriptors and size/allocate the SpMV buffer */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* Pre-CUDA-11 legacy csrmv interface */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* Transpose path: the SpMV already wrote the full-length result; only add yy when it is a distinct vector
         (when yy == zz the addition was folded into the SpMV via beta = 1) */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops (mul+add) per nonzero with an added vector; without one, the first write into each
     nonzero row is an assignment, hence the nonzerorowcnt subtraction */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
37519ae82921SPaul Mullowney 
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* zz = A^T*xx + yy: transpose without conjugation (trans = PETSC_TRUE, herm = PETSC_FALSE) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3758ca45077fSPaul Mullowney 
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  /* Assembly is done entirely on the host; the device copy is refreshed lazily
     by MatSeqAIJCUSPARSECopyToGPU() the next time the matrix is used on the GPU */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
37659ae82921SPaul Mullowney 
3766e057df02SPaul Mullowney /*@
376711a5261eSBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
37682920cce0SJacob Faibussowitsch   (the default parallel PETSc format).
37699ae82921SPaul Mullowney 
3770d083f849SBarry Smith   Collective
37719ae82921SPaul Mullowney 
37729ae82921SPaul Mullowney   Input Parameters:
377311a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
37749ae82921SPaul Mullowney . m    - number of rows
37759ae82921SPaul Mullowney . n    - number of columns
377720f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
377720f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37789ae82921SPaul Mullowney 
37799ae82921SPaul Mullowney   Output Parameter:
37809ae82921SPaul Mullowney . A - the matrix
37819ae82921SPaul Mullowney 
37822ef1f0ffSBarry Smith   Level: intermediate
37832ef1f0ffSBarry Smith 
37842ef1f0ffSBarry Smith   Notes:
37852920cce0SJacob Faibussowitsch   This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for
37862920cce0SJacob Faibussowitsch   calculations. For good matrix assembly performance the user should preallocate the matrix
37872920cce0SJacob Faibussowitsch   storage by setting the parameter `nz` (or the array `nnz`).
37882920cce0SJacob Faibussowitsch 
378911a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
37909ae82921SPaul Mullowney   MatXXXXSetPreallocation() paradigm instead of this routine directly.
379111a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
37929ae82921SPaul Mullowney 
379311a5261eSBarry Smith   The AIJ format, also called
37942ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
37959ae82921SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
379620f4b53cSBarry Smith   either one (as in Fortran) or zero.
37979ae82921SPaul Mullowney 
37989ae82921SPaul Mullowney   Specify the preallocated storage with either nz or nnz (not both).
37992ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
380020f4b53cSBarry Smith   allocation.
38019ae82921SPaul Mullowney 
3802fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
38039ae82921SPaul Mullowney @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* Standard create/size/type/preallocate sequence; for a sequential matrix the local and
     global sizes coincide, hence (m, n, m, n) */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* Call the SeqAIJ preallocation directly; the cast drops const to match its signature */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38139ae82921SPaul Mullowney 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* Free the GPU-side storage: SpMV structures for an unfactored matrix, triangular-factor
     structures otherwise (both live in A->spptr) */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* Detach every function composed on this object at creation/convert time */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  /* Finish with the host-side SeqAIJ destructor, which releases the CPU data */
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38359ae82921SPaul Mullowney 
3836ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
383795639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* Duplicate as a host SeqAIJ first, then convert the copy in place back to AIJCUSPARSE */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38459ff858a8SKarl Rupp 
3846d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3847d71ae5a4SJacob Faibussowitsch {
3848a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3849039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3850039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3851039c6fbaSStefano Zampini   PetscScalar        *ay;
3852039c6fbaSStefano Zampini   const PetscScalar  *ax;
3853039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3854e6e9a74fSStefano Zampini 
385595639643SRichard Tran Mills   PetscFunctionBegin;
3856a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3857a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3858039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
38599566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38609566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
38613ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
386295639643SRichard Tran Mills   }
3863039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
38649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
38659566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
38665f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
38675f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3868039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3869039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3870039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3871039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3872039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3873ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3874039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3875039c6fbaSStefano Zampini   }
3876d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3877d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3878039c6fbaSStefano Zampini 
3879039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3880039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3881039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3882039c6fbaSStefano Zampini     size_t bufferSize;
3883039c6fbaSStefano Zampini     void  *buffer;
3884039c6fbaSStefano Zampini #endif
3885039c6fbaSStefano Zampini 
38869566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
38879566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38889566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3889039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
38909371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38919371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
38929566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
38939566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38949371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38959371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
38969566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
38979566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
38989566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3899039c6fbaSStefano Zampini #else
39009566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39019371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39029371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
39039566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
39049566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3905039c6fbaSStefano Zampini #endif
39069566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
39079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39089566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39099566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3910039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3911a587d139SMark     cublasHandle_t cublasv2handle;
3912a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3913039c6fbaSStefano Zampini 
39149566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
39159566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39169566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39179566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
39189566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39199566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
39209566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
39219566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39229566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39239566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39249566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3925039c6fbaSStefano Zampini   } else {
39269566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
39279566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3928a587d139SMark   }
39293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
393095639643SRichard Tran Mills }
393195639643SRichard Tran Mills 
/* Y <- a*Y: scale the nonzero values of Y in place on the device with one cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t handle;
  PetscScalar   *varray;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* number of stored nonzeros, checked to fit a BLAS int */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &varray));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, varray, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &varray));
  /* diagonal cache refers to the old values; force it to be rebuilt */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
395133c9ba73SStefano Zampini 
/* Zero all stored values of A on both the host and (when present) the device copies,
   keeping the offload mask consistent with where valid data now lives */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij            = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroedOnDevice = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        /* device values exist and are zeroed too, so host and device stay in sync */
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csr = (CsrMatrix *)cusp->matTranspose->mat;

      if (csr->values) thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
    }
  }
  /* always zero the host CSR values */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
39783fa6b06aSMark Adams 
/*
  Bind the matrix to the CPU (flg == PETSC_TRUE) or unbind it (flg == PETSC_FALSE) by
  swapping the Mat and Mat_SeqAIJ operation tables between the plain SeqAIJ host
  implementations and the CUSPARSE device implementations, and (un)registering the
  composed functions that are only meaningful for the device type.
  Factored matrices carry no such tables here, so only the flag is recorded for them.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host copy of the values is current before running host-only ops */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the Mat_SeqAIJ-level accessors so the default host paths are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the host kernels */
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4042a587d139SMark 
/*
  Convert a MATSEQAIJ matrix into a MATSEQAIJCUSPARSE matrix: duplicate or copy A as
  requested by `reuse`, attach the GPU-side data (Mat_SeqAIJCUSPARSE for regular
  matrices, Mat_SeqAIJCUSPARSETriFactors for factored ones), install the CUSPARSE
  operation tables, and register the device-specific composed functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* for MAT_INPLACE_MATRIX neither branch runs and B aliases A */
  B = *newmat;

  /* vectors created from this matrix should live on the GPU too */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead of a plain CSR struct */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining CUSPARSE operations and composed functions */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
41029ae82921SPaul Mullowney 
/* Constructor for MATSEQAIJCUSPARSE: create a plain SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
411002fe1965SBarry Smith 
41113ca39a21SBarry Smith /*MC
4112e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4113e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4117e057df02SPaul Mullowney 
4118e057df02SPaul Mullowney    Options Database Keys:
411911a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
41202ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
41212ef1f0ffSBarry Smith                                       Other options include ell (ellpack) or hyb (hybrid).
41222ef1f0ffSBarry Smith .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
41232ef1f0ffSBarry Smith -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4124e057df02SPaul Mullowney 
4125e057df02SPaul Mullowney   Level: beginner
4126e057df02SPaul Mullowney 
41271cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4128e057df02SPaul Mullowney M*/
41297f756511SDominic Meiser 
/* Register the CUSPARSE factorization backend for every factor type it supports
   (LU/ILU and Cholesky/ICC) on MATSEQAIJCUSPARSE matrices */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
413929b38603SBarry Smith 
/* Release all GPU-side data attached to a (non-factored) MATSEQAIJCUSPARSE matrix */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!dev) PetscFunctionReturn(PETSC_SUCCESS);
  /* tear down both the matrix and its cached transpose in the current storage format */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&dev->mat, dev->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&dev->matTranspose, dev->format));
  /* deleting a null pointer is a no-op, so these need no guards */
  delete dev->workVector;
  delete dev->rowoffsets_gpu;
  delete dev->csr2csc_i;
  delete dev->coords;
  if (dev->handle) PetscCallCUSPARSE(cusparseDestroy(dev->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
41577f756511SDominic Meiser 
/* Free a CsrMatrix (values and index vectors) and null the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
41707f756511SDominic Meiser 
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor structure (legacy csrsv path, pre CUDA 11.4 only):
   cuSPARSE descriptors, the factor's CSR storage, and the various work buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
41897f756511SDominic Meiser 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR or legacy HYB/ELL,
   depending on `format`), its cuSPARSE descriptors, the device-resident scalar
   constants, and the generic-API SpMV/SpMM descriptors and buffers (CUDA >= 11) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants (1, 0) kept in device memory for device-pointer-mode calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached SpMV setup per operation variant (see the cuSpMV array definition) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
42367f756511SDominic Meiser 
/* Release everything held inside a Mat_SeqAIJCUSPARSETriFactors without freeing the
   structure itself (the handle and the structure survive, so it can be refilled by a
   new factorization). The legacy (pre CUDA 11.4) and modern SpSV paths keep different
   members, hence the two preprocessor sections. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy csrsv-based factors: lower/upper and their transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* modern SpSV-based factors: cudaFree/destroy calls below are safe on NULL members */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares storage with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4290ccdfe979SStefano Zampini 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors: reset its contents, then release
   the cuSPARSE handle and the structure itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (!fs) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); /* frees members, keeps fs and fs->handle */
  PetscCallCUSPARSE(cusparseDestroy(fs->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
43017e8381f9SStefano Zampini 
/* Strict weak ordering on (i, j) index pairs: lexicographic, row first then column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* rows equal: compare columns */
  }
};
43107e8381f9SStefano Zampini 
/* Mark the cached device transpose of A as stale; with destroy == PETSC_TRUE also
   free the transpose structure and the csr2csc index map instead of keeping them
   around for reuse */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (dev) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&dev->matTranspose, dev->format));
      delete dev->csr2csc_i;
      dev->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4326a49f1ed0SStefano Zampini 
// PetscContainer destructor for the device-side MatCOOStruct_SeqAIJ attached to
// the matrix as "__PETSc_MatCOOStruct_Device": frees the device copies of perm
// and jmap, then the (host-allocated) struct itself.
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4337ed502f03SStefano Zampini 
// Preallocate a MATSEQAIJCUSPARSE matrix from COO (i,j) index pairs.
// Builds the host-side AIJ/COO structure, assembles the CSR on the GPU, then
// attaches a device-resident shallow copy of the COO struct (with jmap and perm
// mirrored into device memory) for later use by MatSetValuesCOO_SeqAIJCUSPARSE().
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) { // indices live on the device: stage them on the host for the CPU preallocation routine
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); // builds the host COO struct queried below as "__PETSc_MatCOOStruct_Host"
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; // the freshly built CPU data is now the authoritative copy
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4378219fbbafSJunchao Zhang 
// Accumulate COO values kv[] into the CSR value array a[] of length nnz.
// jmap[i]..jmap[i+1] delimits, and perm[] indexes, the kv entries that map to
// nonzero i. With INSERT_VALUES the old a[i] is discarded, otherwise added to.
// Grid-stride loop: correct for any 1-D launch configuration.
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar s = 0.0;

    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) s += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES) ? s : a[i] + s;
  }
}
4389219fbbafSJunchao Zhang 
// Insert/add the values v[] (in COO order) into the GPU CSR data of A, using the
// device-side COO struct (jmap/perm) attached by MatSetPreallocationCOO_SeqAIJCUSPARSE().
// v may be a host or device pointer; host data is staged through a temporary
// device buffer of coo->n scalars.
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); // write-only: no need to copy current values to the GPU
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    // Divide in 64-bit (PetscCount) *before* narrowing to int: the previous
    // (int)(Annz + 255) / 256 overflowed the int cast for very large Annz,
    // producing an invalid grid dimension. The kernel uses a grid-stride loop,
    // so any positive grid size remains correct.
    MatAddCOOValues<<<(int)((Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); // catch launch-configuration errors without clearing sticky state
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); // free the staging buffer allocated above
  PetscFunctionReturn(PETSC_SUCCESS);
}
4429219fbbafSJunchao Zhang 
44305b7e41feSStefano Zampini /*@C
44312ef1f0ffSBarry Smith   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
44325b7e41feSStefano Zampini 
44332ef1f0ffSBarry Smith   Not Collective
44345b7e41feSStefano Zampini 
44355b7e41feSStefano Zampini   Input Parameters:
44365b7e41feSStefano Zampini + A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
44385b7e41feSStefano Zampini 
44395b7e41feSStefano Zampini   Output Parameters:
444020f4b53cSBarry Smith + i - the CSR row pointers
444120f4b53cSBarry Smith - j - the CSR column indices
44425b7e41feSStefano Zampini 
44435b7e41feSStefano Zampini   Level: developer
44445b7e41feSStefano Zampini 
444511a5261eSBarry Smith   Note:
44465b7e41feSStefano Zampini   When compressed is true, the CSR structure does not contain empty rows
44475b7e41feSStefano Zampini 
44481cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
44495b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); // NOTE(review): if either output is NULL, neither is filled -- confirm callers never request just one
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); // may trigger a host-to-device copy of the matrix data
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        // lazily build and cache the full (uncompressed) row offsets on the device from the host CSR
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
44775f101d05SStefano Zampini 
44785b7e41feSStefano Zampini /*@C
44792ef1f0ffSBarry Smith   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
44805b7e41feSStefano Zampini 
44812ef1f0ffSBarry Smith   Not Collective
44825b7e41feSStefano Zampini 
44835b7e41feSStefano Zampini   Input Parameters:
44845b7e41feSStefano Zampini + A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
448620f4b53cSBarry Smith . i          - the CSR row pointers
448720f4b53cSBarry Smith - j          - the CSR column indices
44885b7e41feSStefano Zampini 
44895b7e41feSStefano Zampini   Level: developer
44905b7e41feSStefano Zampini 
44911cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
44925b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: there is no per-access state to undo */
  // invalidate the caller's pointers; the device arrays themselves stay owned by A
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
45035f101d05SStefano Zampini 
45045b7e41feSStefano Zampini /*@C
450511a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45065b7e41feSStefano Zampini 
45075b7e41feSStefano Zampini   Not Collective
45085b7e41feSStefano Zampini 
45095b7e41feSStefano Zampini   Input Parameter:
451011a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45115b7e41feSStefano Zampini 
45125b7e41feSStefano Zampini   Output Parameter:
45135b7e41feSStefano Zampini . a - pointer to the device data
45145b7e41feSStefano Zampini 
45155b7e41feSStefano Zampini   Level: developer
45165b7e41feSStefano Zampini 
451711a5261eSBarry Smith   Note:
451811a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
45195b7e41feSStefano Zampini 
45201cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
45215b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); // ensure device values are current (may copy host -> device)
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get(); // read-only access: offload mask and cached transpose remain valid
  PetscFunctionReturn(PETSC_SUCCESS);
}
4539ed502f03SStefano Zampini 
45405b7e41feSStefano Zampini /*@C
454111a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
45425b7e41feSStefano Zampini 
45435b7e41feSStefano Zampini   Not Collective
45445b7e41feSStefano Zampini 
45452ef1f0ffSBarry Smith   Input Parameters:
45462ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
45472ef1f0ffSBarry Smith - a - pointer to the device data
45485b7e41feSStefano Zampini 
45495b7e41feSStefano Zampini   Level: developer
45505b7e41feSStefano Zampini 
45511cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
45525b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; // invalidate the caller's pointer; no matrix state was modified by the read-only access
  PetscFunctionReturn(PETSC_SUCCESS);
}
4562ed502f03SStefano Zampini 
45635b7e41feSStefano Zampini /*@C
456411a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45655b7e41feSStefano Zampini 
45665b7e41feSStefano Zampini   Not Collective
45675b7e41feSStefano Zampini 
45685b7e41feSStefano Zampini   Input Parameter:
456911a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45705b7e41feSStefano Zampini 
45715b7e41feSStefano Zampini   Output Parameter:
45725b7e41feSStefano Zampini . a - pointer to the device data
45735b7e41feSStefano Zampini 
45745b7e41feSStefano Zampini   Level: developer
45755b7e41feSStefano Zampini 
457611a5261eSBarry Smith   Note:
457711a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
45785b7e41feSStefano Zampini 
45791cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
45805b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); // read-write access needs the current values on the device first
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; // the device copy becomes authoritative once the caller may write
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); // cached transpose will no longer match the (possibly modified) values
  PetscFunctionReturn(PETSC_SUCCESS);
}
46005b7e41feSStefano Zampini /*@C
460111a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4602039c6fbaSStefano Zampini 
46035b7e41feSStefano Zampini   Not Collective
46045b7e41feSStefano Zampini 
46052ef1f0ffSBarry Smith   Input Parameters:
46062ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46072ef1f0ffSBarry Smith - a - pointer to the device data
46085b7e41feSStefano Zampini 
46095b7e41feSStefano Zampini   Level: developer
46105b7e41feSStefano Zampini 
46111cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
46125b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); // values may have changed: drop cached diagonal information
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); // bump object state so dependents notice the modification
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4624039c6fbaSStefano Zampini 
46255b7e41feSStefano Zampini /*@C
462611a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
46275b7e41feSStefano Zampini 
46285b7e41feSStefano Zampini   Not Collective
46295b7e41feSStefano Zampini 
46305b7e41feSStefano Zampini   Input Parameter:
463111a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
46325b7e41feSStefano Zampini 
46335b7e41feSStefano Zampini   Output Parameter:
46345b7e41feSStefano Zampini . a - pointer to the device data
46355b7e41feSStefano Zampini 
46365b7e41feSStefano Zampini   Level: developer
46375b7e41feSStefano Zampini 
463811a5261eSBarry Smith   Note:
463911a5261eSBarry Smith   Does not trigger host-device copies and flags data validity on the GPU
46405b7e41feSStefano Zampini 
46411cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
46425b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  // note: unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU() here --
  // write-only access does not need the current values, so the GPU structure must already exist
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; // the device copy becomes authoritative
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); // cached transpose will no longer match the values about to be written
  PetscFunctionReturn(PETSC_SUCCESS);
}
4661ed502f03SStefano Zampini 
46625b7e41feSStefano Zampini /*@C
466311a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
46645b7e41feSStefano Zampini 
46655b7e41feSStefano Zampini   Not Collective
46665b7e41feSStefano Zampini 
46672ef1f0ffSBarry Smith   Input Parameters:
46682ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46692ef1f0ffSBarry Smith - a - pointer to the device data
46705b7e41feSStefano Zampini 
46715b7e41feSStefano Zampini   Level: developer
46725b7e41feSStefano Zampini 
46731cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
46745b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); // values were (re)written: drop cached diagonal information
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); // bump object state so dependents notice the modification
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4686ed502f03SStefano Zampini 
/* Strict-weak ordering on (row, col, value, flag) tuples by (row, col) only;
   the value and flag components do not participate in the comparison. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4695ed502f03SStefano Zampini 
/* Unary functor adding a fixed offset to an index (used below to shift B's
   column indices past A's columns when concatenating two matrices). */
struct Shift {
  int _shift; // constant offset applied to every input

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4702ed502f03SStefano Zampini 
470321afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4704d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4705d71ae5a4SJacob Faibussowitsch {
4706ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4707ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4708ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4709ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4710ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4711ed502f03SStefano Zampini   cusparseStatus_t              stat;
4712ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4713ed502f03SStefano Zampini 
4714ed502f03SStefano Zampini   PetscFunctionBegin;
4715ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4716ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
47174f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4718ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4719ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47205f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
472108401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4722aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4723aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4724ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4725ed502f03SStefano Zampini     m = A->rmap->n;
4726ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47279566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47289566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47299566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4730ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4731ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4732ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4733ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4734ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4735ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4736ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4737ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4738ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4739ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4740ed502f03SStefano Zampini     Ccusp->nrows            = m;
4741ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4742ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4743ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4744ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47459566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47469566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47479566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4748f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4749f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4750f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
47519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47539566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47549566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47559566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
475628b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
475728b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4758ed502f03SStefano Zampini 
4759ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4760ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4761ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4762ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4763ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4764ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4765ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4766ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4767ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
47682c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4769ed502f03SStefano Zampini     if (c->nz) {
47702ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
47712ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
47722ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
47732ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
47742ed87e7eSStefano Zampini 
4775ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4776ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4777ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4778ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
47799566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4780ed502f03SStefano Zampini         }
47812ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
47822ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4783ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4784ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4785ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4786ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
47879566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4788ed502f03SStefano Zampini         }
47892ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
47902ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
47919566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
47929371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47939371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47949371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47959371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47962ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
47972ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
47982ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
47998909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4800ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4801ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
48028909a122SStefano Zampini #else
48038909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
48048909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
48058909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
48068909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
48078909a122SStefano Zampini #endif
48082ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
48092ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
48102ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
48112ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48122ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48132ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
48142c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
48152c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
4816ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4817792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48188909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48198909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48208909a122SStefano Zampini #endif
48212ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48222ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48232ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4824792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48252ed87e7eSStefano Zampini #else
4826*59c3d2bbSPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
48272ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4828*59c3d2bbSPierre Jolivet   #else
4829*59c3d2bbSPierre Jolivet       auto pred = cuda::std::identity();
4830*59c3d2bbSPierre Jolivet   #endif
4831792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4832792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48332ed87e7eSStefano Zampini #endif
48349371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48359371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48369566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48372ed87e7eSStefano Zampini       delete wPerm;
48382ed87e7eSStefano Zampini       delete Acoo;
48392ed87e7eSStefano Zampini       delete Bcoo;
48402ed87e7eSStefano Zampini       delete Ccoo;
4841ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48429371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48439371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4844ed502f03SStefano Zampini #endif
48451a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
48469566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48479566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4848ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4849ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4850ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4851ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4852ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4853ed502f03SStefano Zampini 
48541a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48551a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4856a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4857ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4858ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4859ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4860ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4861ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4862ed502f03SStefano Zampini 
4863ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4864ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4865ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4866ed502f03SStefano Zampini 
48679566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4868ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4869ed502f03SStefano Zampini         if (AT) {
4870ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4871ed502f03SStefano Zampini           thrust::advance(rT, -1);
4872ed502f03SStefano Zampini         }
4873ed502f03SStefano Zampini         if (BT) {
4874ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4875ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4876ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4877ed502f03SStefano Zampini         }
4878ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4879ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4880ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4881ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4882ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4883ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48849566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4885ed502f03SStefano Zampini 
48869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
48879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
48889566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4889f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4890f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4891f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
48929566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48939566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48949566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4895ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48969371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48979371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4898ed502f03SStefano Zampini #endif
4899ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4900ed502f03SStefano Zampini       }
4901ed502f03SStefano Zampini     }
4902ed502f03SStefano Zampini 
4903ed502f03SStefano Zampini     c->free_a = PETSC_TRUE;
49049f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
49059f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4906ed502f03SStefano Zampini     c->free_ij = PETSC_TRUE;
49077de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4908ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4909ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4910ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4911ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
49129566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49139566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4914ed502f03SStefano Zampini     } else {
49159566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49169566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4917ed502f03SStefano Zampini     }
49189566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49199566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49209566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4921ed502f03SStefano Zampini     c->maxnz         = c->nz;
4922ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4923ed502f03SStefano Zampini     c->rmax          = 0;
4924ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4925ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4926ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4927ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4928ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4929ed502f03SStefano Zampini     }
49309566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
49319566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4932ed502f03SStefano Zampini     (*C)->nonzerostate++;
49339566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49349566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4935ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4936ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4937ed502f03SStefano Zampini   } else {
493808401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4939ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4940ed502f03SStefano Zampini     if (c->nz) {
4941ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49422c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4943aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
494408401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49459566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49469566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49475f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49485f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4949ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4950ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4951ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4952aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4953aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4954aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4955aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49562c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
49572c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
4958ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
49599566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
49602c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
49619371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4962ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49639371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
49642c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4965ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
49669566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
49671a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
49685f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4969ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4970ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4971ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4972ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4973ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4974ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4975ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49761a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4977ed502f03SStefano Zampini       }
49789566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4979ed502f03SStefano Zampini     }
4980ed502f03SStefano Zampini   }
49819566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4982ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4983ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4984ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
49853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4986ed502f03SStefano Zampini }
4987c215019aSStefano Zampini 
4988d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4989d71ae5a4SJacob Faibussowitsch {
4990c215019aSStefano Zampini   bool               dmem;
4991c215019aSStefano Zampini   const PetscScalar *av;
4992c215019aSStefano Zampini 
4993c215019aSStefano Zampini   PetscFunctionBegin;
4994c215019aSStefano Zampini   dmem = isCudaMem(v);
49959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4996c215019aSStefano Zampini   if (n && idx) {
4997c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4998c215019aSStefano Zampini     widx.assign(idx, idx + n);
49999566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5000c215019aSStefano Zampini 
5001c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
5002c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
5003c215019aSStefano Zampini     if (dmem) {
5004c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
5005c215019aSStefano Zampini     } else {
5006c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
5007c215019aSStefano Zampini       dv = w->data();
5008c215019aSStefano Zampini     }
5009c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5010c215019aSStefano Zampini 
5011c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5012c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5013c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
501448a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5015c215019aSStefano Zampini     delete w;
5016c215019aSStefano Zampini   } else {
50179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5018c215019aSStefano Zampini   }
50199566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
50209566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
50213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5022c215019aSStefano Zampini }
5023b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
5024