/* xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 29d3d2f85b004f78dba4877067ddf368d9ccb527) */
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
  #include <cuda/std/functional>
#endif

const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
  The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
  0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
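/* A user selects one of these algorithms by its string name; an illustrative command line (using the
   option names registered in MatSetFromOptions_SeqAIJCUSPARSE() below) would be

     -mat_cusparse_spmv_alg csrmv_alg1

   PetscOptionsEnum() translates the string into its 0-based position in the array, which is why the
   entries above must stay in the same order as the corresponding cuSPARSE enum values. */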

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats.

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
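
/* Example usage, an illustrative sketch (n and the preallocation value 5 are placeholders):

     Mat A;
     PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, n, n, 5, NULL, &A));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

   selects the ELL storage for MatMult(); the same can be requested at runtime with
   -mat_cusparse_mult_storage_format ell (see MatSetFromOptions_SeqAIJCUSPARSE() below). */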

PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  Use this method to specify if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
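
/* Example usage, an illustrative sketch (A is any MATSEQAIJCUSPARSE matrix):

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   keeps the triangular solves on the CPU, where the factors were computed anyway; the same can be
   requested at runtime with -mat_cusparse_use_cpu_solve. */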

static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
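
/* Example usage, an illustrative sketch: request an explicitly stored transpose so that repeated
   MatMultTranspose() calls can reuse it,

     PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE));

   Resetting the option to PETSC_FALSE destroys any cached transpose (see the invalidation above). */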

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
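
/* All of the options above take effect through MatSetFromOptions(); an illustrative command line for
   an application that calls it would be

     ./app -mat_cusparse_storage_format hyb -mat_cusparse_spmm_alg csr_alg2

   which selects the HYB storage for SpMV and TriSolve and the CSR_ALG2 SpMM algorithm. */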

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
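      // Illustrative example (hypothetical row): if row i of L has columns {0, 2} and row i of U has
      // columns {5, 7} to the right of the diagonal, the merged row of M is
      //   Mj = [0, 2, i, 5, 7]
      // i.e. the strict lower part, then the diagonal, then the strict upper part.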
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSV
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }
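        // Illustrative example (hypothetical row): if row i of the factor has lower-triangular
        // columns {0, 2} with values {v0, v2}, the row built here is
        //   AjLo = [..., 0, 2, i, ...],  AALo = [..., v0, v2, 1.0, ...]
        // i.e. the strict lower part followed by an explicit unit diagonal.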

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
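
// Note: the rpermIndices/cpermIndices device arrays built above are consumed at solve time; the
// right-hand side is gathered through rpermIndices before the triangular solves (see, e.g., the
// permuted thrust::copy at the start of MatSolve_SeqAIJCUSPARSE_Cholesky() below), and the solution
// is reordered through cpermIndices afterwards.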

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
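      // Illustrative example (hypothetical row): if row i stores ulen = 3 entries whose first two
      // stored off-diagonal U columns are {5, 7}, the rearranged row becomes
      //   Mj = [i, 5, 7]
      // i.e. the (unit) diagonal index is made explicit and leads the row.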
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSV
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
693d460d7bfSJunchao Zhang 
694204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
695204a0e31SJunchao Zhang     if (fs->updatedSpSVAnalysis) {
696204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
697204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
698204a0e31SJunchao Zhang     } else
699204a0e31SJunchao Zhang   #endif
700204a0e31SJunchao Zhang     {
701d460d7bfSJunchao Zhang       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
702d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
703d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
704204a0e31SJunchao Zhang       fs->updatedSpSVAnalysis = PETSC_TRUE;
705204a0e31SJunchao Zhang     }
706d460d7bfSJunchao Zhang   }
707d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
708d460d7bfSJunchao Zhang }
709d460d7bfSJunchao Zhang 
710d460d7bfSJunchao Zhang // Solve Ut D U x = b
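// The solve proceeds in five stages (the reordering steps are skipped for the natural ordering):
//   1. X = b(rperm)       gather the right-hand side with the row permutation
//   2. solve U^T Y = X    SpSV with CUSPARSE_OPERATION_TRANSPOSE on the stored U
//   3. Y = D .* Y         element-wise scaling; D already holds the inverted diagonal
//   4. solve U X = Y      SpSV with CUSPARSE_OPERATION_NON_TRANSPOSE
//   5. x = X(cperm)       gather the solution with the column permutation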
711d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
712d460d7bfSJunchao Zhang {
713d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
714d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
715d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
716d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
717d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
718d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
719d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
720d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
721d460d7bfSJunchao Zhang 
722d460d7bfSJunchao Zhang   PetscFunctionBegin;
723d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
724d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
725d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
726d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
727d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
728d460d7bfSJunchao Zhang 
729d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
730d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
731d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
732d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
733d460d7bfSJunchao Zhang   } else {
734d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
735d460d7bfSJunchao Zhang   }
736d460d7bfSJunchao Zhang 
737d460d7bfSJunchao Zhang   // Solve Ut Y = X
738d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
739d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
740d460d7bfSJunchao Zhang 
741d460d7bfSJunchao Zhang   // Solve diag(D) Z = Y, computed as Y = Y*D since D was already inverted in MatCholeskyFactorNumeric_SeqAIJ().
742d460d7bfSJunchao Zhang   // This is an element-wise vector multiplication, which cuBLAS does not provide, hence thrust::transform.
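  // In effect: for each i in [0, m), Y[i] *= D[i], launched on PetscDefaultCudaStream.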
743*29d3d2f8SNuno Nobre   #if CCCL_VERSION >= 3001000
744*29d3d2f8SNuno Nobre   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>()));
745*29d3d2f8SNuno Nobre   #else
746d460d7bfSJunchao Zhang   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
747*29d3d2f8SNuno Nobre   #endif
748d460d7bfSJunchao Zhang 
749d460d7bfSJunchao Zhang   // Solve U X = Y
750d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if we need to permute, use the intermediate buffer X
751d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
752d460d7bfSJunchao Zhang   } else {
753d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
754d460d7bfSJunchao Zhang   }
755d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
756d460d7bfSJunchao Zhang 
757d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back into x
758d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
759d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
760d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
761d460d7bfSJunchao Zhang   }
762d460d7bfSJunchao Zhang 
763d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
764d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
765d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
766d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
767d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
768d460d7bfSJunchao Zhang }
769d460d7bfSJunchao Zhang #else
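// Pre-CUDA-11.4 path: build explicit upper/lower triangular CSR factors on the host,
// upload them, and run the csrsv solve analysis (with level scheduling) for later reuse.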
770d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
771d71ae5a4SJacob Faibussowitsch {
772087f3262SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
773087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
774aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
775aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
776087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
777087f3262SPaul Mullowney   PetscScalar                       *AAUp;
778087f3262SPaul Mullowney   PetscScalar                       *AALo;
779087f3262SPaul Mullowney   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
780087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
781087f3262SPaul Mullowney   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
782087f3262SPaul Mullowney   const MatScalar                   *aa = b->a, *v;
783087f3262SPaul Mullowney 
784087f3262SPaul Mullowney   PetscFunctionBegin;
7853ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
786c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
787087f3262SPaul Mullowney     try {
7889566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
7899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
790da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
791087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7929566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
7939566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
794087f3262SPaul Mullowney 
795087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
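        /* The loop below relies on each factor row storing its nz off-diagonal entries
           first and the diagonal last (v[nz]): the upper factor receives 1/v[nz] on the
           diagonal and the negated off-diagonals, while the lower factor reuses the same
           diagonal and additionally divides the off-diagonals by v[nz]. */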
796087f3262SPaul Mullowney         AiUp[0] = (PetscInt)0;
797087f3262SPaul Mullowney         AiUp[n] = nzUpper;
798087f3262SPaul Mullowney         offset  = 0;
799087f3262SPaul Mullowney         for (i = 0; i < n; i++) {
800087f3262SPaul Mullowney           /* set the pointers */
801087f3262SPaul Mullowney           v  = aa + ai[i];
802087f3262SPaul Mullowney           vj = aj + ai[i];
803087f3262SPaul Mullowney           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
804087f3262SPaul Mullowney 
805087f3262SPaul Mullowney           /* first, set the diagonal elements */
806087f3262SPaul Mullowney           AjUp[offset] = (PetscInt)i;
80709f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0 / v[nz];
808087f3262SPaul Mullowney           AiUp[i]      = offset;
80909f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0 / v[nz];
810087f3262SPaul Mullowney 
811087f3262SPaul Mullowney           offset += 1;
812087f3262SPaul Mullowney           if (nz > 0) {
813f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
814f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
815087f3262SPaul Mullowney             for (j = offset; j < offset + nz; j++) {
816087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
817087f3262SPaul Mullowney               AALo[j] = AAUp[j] / v[nz];
818087f3262SPaul Mullowney             }
819087f3262SPaul Mullowney             offset += nz;
820087f3262SPaul Mullowney           }
821087f3262SPaul Mullowney         }
822087f3262SPaul Mullowney 
823aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8249566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
825da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
826087f3262SPaul Mullowney 
827aa372e3fSPaul Mullowney         /* Create the matrix description */
8289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
8299566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8301b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
832afb2bd1cSJunchao Zhang   #else
8339566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
834afb2bd1cSJunchao Zhang   #endif
8359566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8369566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
837087f3262SPaul Mullowney 
838aa372e3fSPaul Mullowney         /* set the matrix */
839aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
840aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = A->rmap->n;
841aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = A->cmap->n;
842aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
843aa372e3fSPaul Mullowney 
844aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
845aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
848aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
849aa372e3fSPaul Mullowney 
850aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
851aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
852aa372e3fSPaul Mullowney 
853afb2bd1cSJunchao Zhang         /* set the operation */
854afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
855afb2bd1cSJunchao Zhang 
856afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8579566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
858261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
8591b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8609371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8619371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
8629566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
863afb2bd1cSJunchao Zhang   #endif
864afb2bd1cSJunchao Zhang 
865aa372e3fSPaul Mullowney         /* perform the solve analysis */
8669371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8679f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
8689f7ba44dSJacob Faibussowitsch 
8699566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8709566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
871aa372e3fSPaul Mullowney 
872da79fbbcSStefano Zampini         /* assign the pointer */
873aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
874aa372e3fSPaul Mullowney 
875aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8769566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
877da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
878aa372e3fSPaul Mullowney 
879aa372e3fSPaul Mullowney         /* Create the matrix description */
8809566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8819566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8821b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8839566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
884afb2bd1cSJunchao Zhang   #else
8859566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
886afb2bd1cSJunchao Zhang   #endif
8879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8889566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
889aa372e3fSPaul Mullowney 
890aa372e3fSPaul Mullowney         /* set the operation */
891aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
892aa372e3fSPaul Mullowney 
893aa372e3fSPaul Mullowney         /* set the matrix */
894aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
895aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = A->rmap->n;
896aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = A->cmap->n;
897aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
898aa372e3fSPaul Mullowney 
899aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
900aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
901aa372e3fSPaul Mullowney 
902aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
903aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
904aa372e3fSPaul Mullowney 
905aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
906aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
907aa372e3fSPaul Mullowney 
908afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
9099566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
910261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
9111b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9129371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9139371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
9149566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
915afb2bd1cSJunchao Zhang   #endif
916afb2bd1cSJunchao Zhang 
917aa372e3fSPaul Mullowney         /* perform the solve analysis */
9189371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9199f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
9209f7ba44dSJacob Faibussowitsch 
9219566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
9229566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
923aa372e3fSPaul Mullowney 
924da79fbbcSStefano Zampini         /* assign the pointer */
925aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
926087f3262SPaul Mullowney 
9279566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
9289566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
9299566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
930da79fbbcSStefano Zampini       } else {
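        /* The factor structure already lives on the GPU from an earlier call: recompute
           only the numerical values on the host here and re-upload them below. */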
931da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
932da79fbbcSStefano Zampini         offset = 0;
933da79fbbcSStefano Zampini         for (i = 0; i < n; i++) {
934da79fbbcSStefano Zampini           /* set the pointers */
935da79fbbcSStefano Zampini           v  = aa + ai[i];
936da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
937da79fbbcSStefano Zampini 
938da79fbbcSStefano Zampini           /* first, set the diagonal elements */
939da79fbbcSStefano Zampini           AAUp[offset] = 1.0 / v[nz];
940da79fbbcSStefano Zampini           AALo[offset] = 1.0 / v[nz];
941da79fbbcSStefano Zampini 
942da79fbbcSStefano Zampini           offset += 1;
943da79fbbcSStefano Zampini           if (nz > 0) {
944f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
945da79fbbcSStefano Zampini             for (j = offset; j < offset + nz; j++) {
946da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
947da79fbbcSStefano Zampini               AALo[j] = AAUp[j] / v[nz];
948da79fbbcSStefano Zampini             }
949da79fbbcSStefano Zampini             offset += nz;
950da79fbbcSStefano Zampini           }
951da79fbbcSStefano Zampini         }
95228b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
95328b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
954da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
955da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
9569566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
957da79fbbcSStefano Zampini       }
9589566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
9599566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
960d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
961d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
962d71ae5a4SJacob Faibussowitsch     }
963087f3262SPaul Mullowney   }
9643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
965087f3262SPaul Mullowney }
966d460d7bfSJunchao Zhang #endif
967087f3262SPaul Mullowney 
968d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
969d71ae5a4SJacob Faibussowitsch {
970087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
971087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
972087f3262SPaul Mullowney   IS                            ip                 = a->row;
973087f3262SPaul Mullowney   PetscBool                     perm_identity;
974087f3262SPaul Mullowney   PetscInt                      n = A->rmap->n;
975087f3262SPaul Mullowney 
976087f3262SPaul Mullowney   PetscFunctionBegin;
97728b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
978d460d7bfSJunchao Zhang 
979b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
9805c7eeb11SPierre Jolivet   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
981d460d7bfSJunchao Zhang #else
9829566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
983ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
984d460d7bfSJunchao Zhang #endif
985aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
986aa372e3fSPaul Mullowney 
987da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
988da79fbbcSStefano Zampini 
989087f3262SPaul Mullowney   /* cache the row/column permutations on the GPU if the ordering is not the identity */
9909566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
991087f3262SPaul Mullowney   if (!perm_identity) {
9924e4bbfaaSStefano Zampini     IS              iip;
993da79fbbcSStefano Zampini     const PetscInt *irip, *rip;
9944e4bbfaaSStefano Zampini 
9959566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
9969566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip, &irip));
9979566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip, &rip));
998aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
999aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1000aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
10014e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
10029566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip, &irip));
10039566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
10049566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip, &rip));
10059566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1006da79fbbcSStefano Zampini   }
10073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1008087f3262SPaul Mullowney }
1009087f3262SPaul Mullowney 
1010d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1011d71ae5a4SJacob Faibussowitsch {
1012087f3262SPaul Mullowney   PetscFunctionBegin;
10139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
10149566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1015ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
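  // The numeric factorization itself runs on the CPU in MatCholeskyFactorNumeric_SeqAIJ();
  // only the resulting factors are copied to the GPU further below.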
1016d460d7bfSJunchao Zhang 
1017b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1018d460d7bfSJunchao Zhang   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1019d460d7bfSJunchao Zhang   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1020d460d7bfSJunchao Zhang #else
1021087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
1022d460d7bfSJunchao Zhang   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1023d460d7bfSJunchao Zhang   IS          ip = b->row;
1024d460d7bfSJunchao Zhang   PetscBool   perm_identity;
1025d460d7bfSJunchao Zhang 
10269566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
1027087f3262SPaul Mullowney   if (perm_identity) {
1028087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1029087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1030087f3262SPaul Mullowney   } else {
1031087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1032087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1033d460d7bfSJunchao Zhang   }
1034d460d7bfSJunchao Zhang #endif
10354e4bbfaaSStefano Zampini   B->ops->matsolve          = NULL;
10364e4bbfaaSStefano Zampini   B->ops->matsolvetranspose = NULL;
1037087f3262SPaul Mullowney 
1038087f3262SPaul Mullowney   /* get the triangular factors */
10399566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
10403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1041087f3262SPaul Mullowney }
10429ae82921SPaul Mullowney 
1043b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
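// With the legacy csrsv API, transpose solves are served by explicitly transposed (CSC)
// copies of both triangular factors, each with its own solve analysis; this routine
// builds those copies once and caches them in the tri-factors structure.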
1044d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1045d71ae5a4SJacob Faibussowitsch {
1046bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1047aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1048aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1049da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1050da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1051aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1052aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1053aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1054aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1055b175d8bbSPaul Mullowney 
1056bda325fcSPaul Mullowney   PetscFunctionBegin;
1057aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10589566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1059da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1060aa372e3fSPaul Mullowney 
1061aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1062aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1063aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10649371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1065aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1066aa372e3fSPaul Mullowney 
1067aa372e3fSPaul Mullowney   /* Create the matrix description */
10689566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10699566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10709566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10719566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10729566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1073aa372e3fSPaul Mullowney 
1074aa372e3fSPaul Mullowney   /* set the operation */
1075aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1076aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor */
1077aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1078aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1079afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1080afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1081aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1082afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1083afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1084afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1085aa372e3fSPaul Mullowney 
1086aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1087afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10889371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
10899371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10909371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10919566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1092afb2bd1cSJunchao Zhang   #endif
1093afb2bd1cSJunchao Zhang 
10949566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10959f7ba44dSJacob Faibussowitsch   {
10969f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrap this function...
10979f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
10989371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1099afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11009f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1101afb2bd1cSJunchao Zhang   #else
11029f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1103afb2bd1cSJunchao Zhang   #endif
11049f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11059f7ba44dSJacob Faibussowitsch   }
11069f7ba44dSJacob Faibussowitsch 
11079566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11089566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1109aa372e3fSPaul Mullowney 
1110afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11119566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1112261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11131b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11149371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11159371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1117afb2bd1cSJunchao Zhang   #endif
1118afb2bd1cSJunchao Zhang 
1119afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11209371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11219f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11229f7ba44dSJacob Faibussowitsch 
11239566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11249566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1125aa372e3fSPaul Mullowney 
1126da79fbbcSStefano Zampini   /* assign the pointer */
1127aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1128aa372e3fSPaul Mullowney 
1129aa372e3fSPaul Mullowney   /*********************************************/
1130aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1131aa372e3fSPaul Mullowney   /*********************************************/
1132aa372e3fSPaul Mullowney 
1133aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11349566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1135da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1136aa372e3fSPaul Mullowney 
1137aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1138aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1139aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11409371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1141aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1142aa372e3fSPaul Mullowney 
1143aa372e3fSPaul Mullowney   /* Create the matrix description */
11449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11469566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11489566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1149aa372e3fSPaul Mullowney 
1150aa372e3fSPaul Mullowney   /* set the operation */
1151aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1152aa372e3fSPaul Mullowney 
1153aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor */
1154aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1155afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1156afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1157aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1158afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1159afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1160afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1161aa372e3fSPaul Mullowney 
1162aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1163afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11649371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11659371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11669371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11679566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1168afb2bd1cSJunchao Zhang   #endif
1169afb2bd1cSJunchao Zhang 
11709566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11719f7ba44dSJacob Faibussowitsch   {
11729f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrap this function...
11739f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11749371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1175afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11769f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1177afb2bd1cSJunchao Zhang   #else
11789f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1179afb2bd1cSJunchao Zhang   #endif
11809f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11819f7ba44dSJacob Faibussowitsch   }
1182d49cd2b7SBarry Smith 
11839566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11849566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1185aa372e3fSPaul Mullowney 
1186afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11879566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1188261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
11891b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11909371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11919371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
11929566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1193afb2bd1cSJunchao Zhang   #endif
1194afb2bd1cSJunchao Zhang 
1195afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11965f80ce2aSJacob Faibussowitsch   /* TODO: factor this repeated csrsv buffer-size/analysis boilerplate into a helper function */
11979371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11989f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1199d49cd2b7SBarry Smith 
12009566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12019566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1202aa372e3fSPaul Mullowney 
1203da79fbbcSStefano Zampini   /* assign the pointer */
1204aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1206bda325fcSPaul Mullowney }
1207d460d7bfSJunchao Zhang #endif
1208bda325fcSPaul Mullowney 
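// Converts permutation values written by cusparse_csr2csc as PetscScalar back into
// PetscInt indices; used below to record the CSR-to-CSC entry permutation csr2csc_i.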
12099371c9d4SSatish Balay struct PetscScalarToPetscInt {
12109371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1211a49f1ed0SStefano Zampini };
1212a49f1ed0SStefano Zampini 
1213d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1214d71ae5a4SJacob Faibussowitsch {
1215aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1216a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1217bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1218bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1219aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1220b175d8bbSPaul Mullowney 
1221bda325fcSPaul Mullowney   PetscFunctionBegin;
12229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1223a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
122428b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1225a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
122608401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
12273ba16761SJacob Faibussowitsch   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
12289566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
12299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
123048a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1231a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1232aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12339566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1234aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12359566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12369566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1237aa372e3fSPaul Mullowney 
1238b06137fdSPaul Mullowney     /* set alpha and beta */
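    /* device-resident copies of one and zero, used as the alpha/beta scalars in
       cusparse calls on the transpose */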
1239f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1240f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1241f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
12429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1245b06137fdSPaul Mullowney 
1246aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1247aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1248a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1249554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1250554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1251aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1252a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1253aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1254aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1255a3fdcf43SKarl Rupp 
1256ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
125781902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
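      // rowoffsets_gpu keeps a 32-bit copy of the row offsets on the GPU; the csr2csc
      // calls below read it when filling the transpose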
1258afb2bd1cSJunchao Zhang 
1259afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
12603606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
12619371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12629371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
12639371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
12643606e59fSJunchao Zhang   #else
12653606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12663606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12673606e59fSJunchao Zhang 
12683606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12693606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12703606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12713606e59fSJunchao Zhang         */
12723606e59fSJunchao Zhang       if (matrixT->num_entries) {
12739371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
12749371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
12753606e59fSJunchao Zhang 
12763606e59fSJunchao Zhang       } else {
12773606e59fSJunchao Zhang         matstructT->matDescr = NULL;
12783606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
12793606e59fSJunchao Zhang       }
12803606e59fSJunchao Zhang   #endif
1281afb2bd1cSJunchao Zhang #endif
1282aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1283afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1284afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1285afb2bd1cSJunchao Zhang #else
1286aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
128751c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
128851c6d536SStefano Zampini       /* First convert HYB to CSR */
1289aa372e3fSPaul Mullowney       temp->num_rows       = A->rmap->n;
1290aa372e3fSPaul Mullowney       temp->num_cols       = A->cmap->n;
1291aa372e3fSPaul Mullowney       temp->num_entries    = a->nz;
1292aa372e3fSPaul Mullowney       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1293aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1294aa372e3fSPaul Mullowney       temp->values         = new THRUSTARRAY(a->nz);
1295aa372e3fSPaul Mullowney 
12969371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
12979371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1298aa372e3fSPaul Mullowney 
1299aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1300aa372e3fSPaul Mullowney       tempT->num_rows       = A->rmap->n;
1301aa372e3fSPaul Mullowney       tempT->num_cols       = A->cmap->n;
1302aa372e3fSPaul Mullowney       tempT->num_entries    = a->nz;
1303aa372e3fSPaul Mullowney       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1304aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1305aa372e3fSPaul Mullowney       tempT->values         = new THRUSTARRAY(a->nz);
1306aa372e3fSPaul Mullowney 
13079371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
13089371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13099371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1310aa372e3fSPaul Mullowney 
1311aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1312aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13139566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
13149371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
13159371c9d4SSatish Balay       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
13169371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1317aa372e3fSPaul Mullowney 
1318aa372e3fSPaul Mullowney       /* assign the pointer */
1319aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13201a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1321aa372e3fSPaul Mullowney       /* delete temporaries */
1322aa372e3fSPaul Mullowney       if (tempT) {
1323aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1324aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1325aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1326aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1327087f3262SPaul Mullowney       }
1328aa372e3fSPaul Mullowney       if (temp) {
1329aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1330aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1331aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1332aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1333aa372e3fSPaul Mullowney       }
1334afb2bd1cSJunchao Zhang #endif
1335aa372e3fSPaul Mullowney     }
1336a49f1ed0SStefano Zampini   }
1337a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1338a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1339a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
134028b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
134128b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
134228b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
134328b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
134428b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
134528b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
134628b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
134728b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1348a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1349a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1350a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
13519566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1352a49f1ed0SStefano Zampini     }
1353a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1354a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1355792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1356a49f1ed0SStefano Zampini 
1357a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1358a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1359a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1360a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
13619371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
13629371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
13639371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
13649566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1365a49f1ed0SStefano Zampini #endif
1366a49f1ed0SStefano Zampini 
13671a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13681a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13691a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13701a2c6b5cSJunchao Zhang            I checked every parameter and they were all fine. I have no clue why cusparse complains.
13711a2c6b5cSJunchao Zhang 
13721a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13731a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13741a2c6b5cSJunchao Zhang         */
13759371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1376a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13779371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
13789371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1379a49f1ed0SStefano Zampini #else
13809371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13819371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1382a49f1ed0SStefano Zampini #endif
13831a2c6b5cSJunchao Zhang       } else {
13841a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
13851a2c6b5cSJunchao Zhang       }
13861a2c6b5cSJunchao Zhang 
1387a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1388792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1389a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13909566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1391a49f1ed0SStefano Zampini #endif
1392a49f1ed0SStefano Zampini     }
13939371c9d4SSatish Balay     PetscCallThrust(
13949371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1395a49f1ed0SStefano Zampini   }
13969566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13979566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1398213423ffSJunchao Zhang   /* the compressed row indices are not used for matTranspose */
1399213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1400aa372e3fSPaul Mullowney   /* assign the pointer */
1401aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
14021a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
14033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1404bda325fcSPaul Mullowney }
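/* A short illustrative sketch (not part of the original code) of the csr2csc_i trick used above:
   the one-time csr2csc pass is run on the index sequence 0,1,...,nnz-1, so matrixT->values ends up
   holding, for each transposed entry, the position of its source entry in A. Converting those values
   to integers yields csr2csc_i, after which every later transpose update reduces to the single gather
   done by the thrust::copy above, conceptually:

     for (k = 0; k < nnz; k++) valuesT[k] = values[csr2csc_i[k]];
*/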
1405bda325fcSPaul Mullowney 
1406b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1407d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1408d460d7bfSJunchao Zhang {
1409d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1410d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1411d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1412d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1413d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1414d460d7bfSJunchao Zhang   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1415d460d7bfSJunchao Zhang   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1416d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1417d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1418d460d7bfSJunchao Zhang 
1419d460d7bfSJunchao Zhang   PetscFunctionBegin;
1420d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1421d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1422d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1423d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1424d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1425d460d7bfSJunchao Zhang 
1426d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1427d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1428d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1429d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1430d460d7bfSJunchao Zhang   } else {
1431d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1432d460d7bfSJunchao Zhang   }
1433d460d7bfSJunchao Zhang 
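  // Conceptually the factors satisfy (up to the exact permutation convention) Pr*A*Pc = L*U, so the
  // solve below computes x = Pc*(U^{-1}*(L^{-1}*(Pr*b))): permute b, solve with L, solve with U, permute into x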
1434d460d7bfSJunchao Zhang   // Solve L Y = X
1435d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1436d460d7bfSJunchao Zhang   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1437d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1438d460d7bfSJunchao Zhang 
1439d460d7bfSJunchao Zhang   // Solve U X = Y
1440d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1441d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1442d460d7bfSJunchao Zhang   } else {
1443d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1444d460d7bfSJunchao Zhang   }
1445d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1446d460d7bfSJunchao Zhang 
1447d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1448d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1449d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1450d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1451d460d7bfSJunchao Zhang   }
1452d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1453d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1454d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1455d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1456d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1457d460d7bfSJunchao Zhang }
1458d460d7bfSJunchao Zhang 
1459d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1460d460d7bfSJunchao Zhang {
1461d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1462d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1463d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1464d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1465d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1466d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1467d460d7bfSJunchao Zhang   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1468d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1469d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1470d460d7bfSJunchao Zhang 
1471d460d7bfSJunchao Zhang   PetscFunctionBegin;
1472d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1473d460d7bfSJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1474d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1475d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1476d460d7bfSJunchao Zhang                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1477d460d7bfSJunchao Zhang 
1478d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1479d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1480d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1481d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1482d460d7bfSJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1483d460d7bfSJunchao Zhang   }
1484d460d7bfSJunchao Zhang 
1485d460d7bfSJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
1486d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1487d460d7bfSJunchao Zhang 
1488d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1489d460d7bfSJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1490d460d7bfSJunchao Zhang   }
1491d460d7bfSJunchao Zhang 
1492d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1493d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1494d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1495d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1496d460d7bfSJunchao Zhang 
1497d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1498d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1499d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1500d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1501d460d7bfSJunchao Zhang   } else {
1502d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1503d460d7bfSJunchao Zhang   }
1504d460d7bfSJunchao Zhang 
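  // Since A = L*U (modulo permutations), A^T = U^T * L^T, so the transpose solve applies the factors
  // in reverse order: first solve U^T Y = X, then L^T X = Y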
1505d460d7bfSJunchao Zhang   // Solve Ut Y = X
1506d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1507d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1508d460d7bfSJunchao Zhang 
1509d460d7bfSJunchao Zhang   // Solve Lt X = Y
1510d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if we need to permute, go through the intermediate buffer X
1511d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1512d460d7bfSJunchao Zhang   } else {
1513d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1514d460d7bfSJunchao Zhang   }
1515d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1516d460d7bfSJunchao Zhang 
1517d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1518d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1519d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1520d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1521d460d7bfSJunchao Zhang   }
1522d460d7bfSJunchao Zhang 
1523d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1524d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1525d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1526d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1527d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1528d460d7bfSJunchao Zhang }
1529d460d7bfSJunchao Zhang #else
1530a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1531d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1532d71ae5a4SJacob Faibussowitsch {
1533c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1534465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1535465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1536465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1537465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1538bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1539aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1540aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1541aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1542bda325fcSPaul Mullowney 
1543bda325fcSPaul Mullowney   PetscFunctionBegin;
1544aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1545aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15469566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1547aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1548aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1549bda325fcSPaul Mullowney   }
1550bda325fcSPaul Mullowney 
1551bda325fcSPaul Mullowney   /* Get the GPU pointers */
15529566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15539566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1554c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1555c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1556bda325fcSPaul Mullowney 
15579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1558aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15599371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1560aa372e3fSPaul Mullowney 
1561aa372e3fSPaul Mullowney   /* First, solve U */
15629f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15639f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1564aa372e3fSPaul Mullowney 
1565aa372e3fSPaul Mullowney   /* Then, solve L */
15669f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15679f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1568aa372e3fSPaul Mullowney 
1569aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15709371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1571aa372e3fSPaul Mullowney 
1572aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1573a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1574bda325fcSPaul Mullowney 
1575bda325fcSPaul Mullowney   /* restore */
15769566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
15779566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
15789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
15803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1581bda325fcSPaul Mullowney }
1582bda325fcSPaul Mullowney 
1583d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1584d71ae5a4SJacob Faibussowitsch {
1585465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1586465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1587bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1588aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1589aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1590aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1591bda325fcSPaul Mullowney 
1592bda325fcSPaul Mullowney   PetscFunctionBegin;
1593aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1594aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15959566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1596aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1597aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1598bda325fcSPaul Mullowney   }
1599bda325fcSPaul Mullowney 
1600bda325fcSPaul Mullowney   /* Get the GPU pointers */
16019566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16029566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1603bda325fcSPaul Mullowney 
16049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1605aa372e3fSPaul Mullowney   /* First, solve U */
16069f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16079f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1608aa372e3fSPaul Mullowney 
1609aa372e3fSPaul Mullowney   /* Then, solve L */
16109f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16119f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1612bda325fcSPaul Mullowney 
1613bda325fcSPaul Mullowney   /* restore */
16149566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16159566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1619bda325fcSPaul Mullowney }
1620bda325fcSPaul Mullowney 
1621d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1622d71ae5a4SJacob Faibussowitsch {
1623465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1624465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1625465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1626465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16279ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1628aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1629aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1630aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16319ae82921SPaul Mullowney 
16329ae82921SPaul Mullowney   PetscFunctionBegin;
1633e057df02SPaul Mullowney   /* Get the GPU pointers */
16349566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16359566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1636c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1637c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16389ae82921SPaul Mullowney 
16399566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1640aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
16419371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1642aa372e3fSPaul Mullowney 
1643aa372e3fSPaul Mullowney   /* Next, solve L */
16449f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16459f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1646aa372e3fSPaul Mullowney 
1647aa372e3fSPaul Mullowney   /* Then, solve U */
16489f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16499f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1650d49cd2b7SBarry Smith 
16514e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16529371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16539ae82921SPaul Mullowney 
16549566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16559566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16599ae82921SPaul Mullowney }
16609ae82921SPaul Mullowney 
1661d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1662d71ae5a4SJacob Faibussowitsch {
1663465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1664465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16659ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1666aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1667aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1668aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16699ae82921SPaul Mullowney 
16709ae82921SPaul Mullowney   PetscFunctionBegin;
1671e057df02SPaul Mullowney   /* Get the GPU pointers */
16729566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16739566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16749ae82921SPaul Mullowney 
16759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1676aa372e3fSPaul Mullowney   /* First, solve L */
16779f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16789f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1679d49cd2b7SBarry Smith 
1680aa372e3fSPaul Mullowney   /* Next, solve U */
16819f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16829f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
16839ae82921SPaul Mullowney 
16849566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16859566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16869566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16879566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16899ae82921SPaul Mullowney }
1690d460d7bfSJunchao Zhang #endif
16919ae82921SPaul Mullowney 
1692b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
16938eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1694d71ae5a4SJacob Faibussowitsch {
1695da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1696da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1697da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1698da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1699da112707SJunchao Zhang   PetscInt                      m, nz;
1700da112707SJunchao Zhang   PetscBool                     flg;
1701da112707SJunchao Zhang 
1702da112707SJunchao Zhang   PetscFunctionBegin;
1703da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1704da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1705da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1706da112707SJunchao Zhang   }
1707da112707SJunchao Zhang 
1708da112707SJunchao Zhang   /* Copy A's value to fact */
1709da112707SJunchao Zhang   m  = fact->rmap->n;
1710da112707SJunchao Zhang   nz = aij->nz;
1711da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1712da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1713da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1714da112707SJunchao Zhang 
1715bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeBegin());
1716da112707SJunchao Zhang   /* Factorize fact inplace */
17179371c9d4SSatish Balay   if (m)
17189371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1719d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1720da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1721da112707SJunchao Zhang     int              numerical_zero;
1722da112707SJunchao Zhang     cusparseStatus_t status;
1723da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1724da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1725da112707SJunchao Zhang   }
1726da112707SJunchao Zhang 
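  /* With CUDA >= 12.1.1, cusparseSpSV_updateMatrix() can push the new factored values into the existing
     SpSV descriptors, letting us skip the more expensive cusparseSpSV_analysis() redo below. */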
1727204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1728204a0e31SJunchao Zhang   if (fs->updatedSpSVAnalysis) {
1729204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1730204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1731204a0e31SJunchao Zhang   } else
1732204a0e31SJunchao Zhang   #endif
1733204a0e31SJunchao Zhang   {
173512ba2bc6SJunchao Zhang     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore, we do it after cusparseXcsrilu02().
173512ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
173612ba2bc6SJunchao Zhang     */
17379371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1738da112707SJunchao Zhang 
17399371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1740da112707SJunchao Zhang 
1741204a0e31SJunchao Zhang     fs->updatedSpSVAnalysis = PETSC_TRUE;
174212ba2bc6SJunchao Zhang     /* L, U values have changed; reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
174312ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1744204a0e31SJunchao Zhang   }
174512ba2bc6SJunchao Zhang 
1746da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1747d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1748d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1749da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1750da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1751bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeEnd());
1752da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1754da112707SJunchao Zhang }
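/* A minimal usage sketch (an assumption about typical usage, not code from this file): the ILU(0)
   device path above is reached through the standard PETSc options interface, e.g.,

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   after which the symbolic routine below runs once per nonzero pattern and
   MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0() reruns whenever the matrix values change. */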
1755da112707SJunchao Zhang 
17568eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1757d71ae5a4SJacob Faibussowitsch {
1758da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1759da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1760da112707SJunchao Zhang   PetscInt                      m, nz;
1761da112707SJunchao Zhang 
1762da112707SJunchao Zhang   PetscFunctionBegin;
1763da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1764da112707SJunchao Zhang     PetscInt  i;
1765da112707SJunchao Zhang     PetscBool flg, missing;
1766da112707SJunchao Zhang 
1767da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1768da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1769da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1770da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1771da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1772da112707SJunchao Zhang   }
1773da112707SJunchao Zhang 
1774da112707SJunchao Zhang   /* Free the old stale stuff */
1775da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1776da112707SJunchao Zhang 
1777da112707SJunchao Zhang   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
1778da112707SJunchao Zhang      but they will not be used; we allocate them only to ease debugging.
1779da112707SJunchao Zhang    */
1780da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1781da112707SJunchao Zhang 
1782da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1783da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1784da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1785da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1786da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1787da112707SJunchao Zhang 
1788da112707SJunchao Zhang   aij->row = NULL;
1789da112707SJunchao Zhang   aij->col = NULL;
1790da112707SJunchao Zhang 
1791da112707SJunchao Zhang   /* ====================================================================== */
1792da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1793da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1794da112707SJunchao Zhang   /* ====================================================================== */
1795da112707SJunchao Zhang   const int *Ai, *Aj;
1796da112707SJunchao Zhang 
1797da112707SJunchao Zhang   m  = fact->rmap->n;
1798da112707SJunchao Zhang   nz = aij->nz;
1799da112707SJunchao Zhang 
1800f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1801f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1802f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1803d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1804d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1805d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1806da112707SJunchao Zhang 
1807da112707SJunchao Zhang   /* ====================================================================== */
1808da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1809da112707SJunchao Zhang   /* ====================================================================== */
1810da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1811da112707SJunchao Zhang   cusparseDiagType_t diagType;
1812da112707SJunchao Zhang 
1813da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1814da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1815da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1816da112707SJunchao Zhang 
1817da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1818da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1819da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1820da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1821da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1822da112707SJunchao Zhang   */
1823da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1824da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1825d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18269371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18279371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1828da112707SJunchao Zhang 
1829da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1830da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1831d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18329371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18339371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
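  /* Note that spMatDescr_L and spMatDescr_U alias the same in-place storage (csrRowPtr32, csrColIdx32,
     csrVal) holding the factored M; the fill-mode/diag-type attributes tell cusparse which triangle to
     read, and with CUSPARSE_DIAG_TYPE_UNIT on L the diagonal entries stored in csrVal are read only as
     part of U. */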
1834da112707SJunchao Zhang 
1835da112707SJunchao Zhang   /* ========================================================================= */
1836da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1837da112707SJunchao Zhang   /* ========================================================================= */
1838da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18399371c9d4SSatish Balay   if (m)
18409371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1841d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1842da112707SJunchao Zhang 
1843da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1844da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1845da112707SJunchao Zhang 
1846da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1847da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1848da112707SJunchao Zhang 
1849da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18509371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1851da112707SJunchao Zhang 
1852da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18539371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1854da112707SJunchao Zhang 
1855da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
185612ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
185712ba2bc6SJunchao Zhang      spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U.
185812ba2bc6SJunchao Zhang      To save memory, we share factBuffer_M with the bigger of spsvBuffer_L/U.
1859da112707SJunchao Zhang    */
186012ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
186112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
186212ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1863da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
186412ba2bc6SJunchao Zhang   } else {
186512ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
186612ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1867da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
186812ba2bc6SJunchao Zhang   }
1869da112707SJunchao Zhang 
1870da112707SJunchao Zhang   /* ========================================================================== */
1871da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1872da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1873da112707SJunchao Zhang   /* ========================================================================== */
1874da112707SJunchao Zhang   int              structural_zero;
1875da112707SJunchao Zhang   cusparseStatus_t status;
1876da112707SJunchao Zhang 
1877da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18789371c9d4SSatish Balay   if (m)
18799371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1880d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1881da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
188246aba097SBarry Smith     /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1883da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1884da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1885da112707SJunchao Zhang   }
1886da112707SJunchao Zhang 
1887da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18880dd8c0acSJunchao Zhang   {
1889da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
18900dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1891da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1892da112707SJunchao Zhang 
1893da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1894da112707SJunchao Zhang     Ai    = Aseq->i;
1895da112707SJunchao Zhang     Adiag = Aseq->diag;
1896da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1897da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros to the left of the diagonal in row i */
1898da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1899da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1900da112707SJunchao Zhang         /* We eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates the nonzeros
1901da112707SJunchao Zhang           to the right of, and including, the eliminated one, with each update incurring a multiplication and an addition.
1902da112707SJunchao Zhang         */
1903da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2; /* overwrite the exact count computed above with an estimate that assumes the diagonal sits mid-row */
1904da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
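        /* e.g., a row with nzRow = 5 gives nzLeft = (5 - 1) / 2 = 2 and contributes 2 * (2 * 5 - 2 + 1) = 18 flops */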
1905da112707SJunchao Zhang       }
1906da112707SJunchao Zhang     }
1907da112707SJunchao Zhang     fs->numericFactFlops = flops;
19080dd8c0acSJunchao Zhang   }
1909da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1911da112707SJunchao Zhang }
1912da112707SJunchao Zhang 
1913d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1914d71ae5a4SJacob Faibussowitsch {
1915da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1916da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1917da112707SJunchao Zhang   const PetscScalar            *barray;
1918da112707SJunchao Zhang   PetscScalar                  *xarray;
1919da112707SJunchao Zhang 
1920da112707SJunchao Zhang   PetscFunctionBegin;
1921da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1922da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1923da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1924da112707SJunchao Zhang 
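  /* ICC(0) factors A ~= L*Lt, so the solve is the usual two-stage substitution: L y = b, then Lt x = y */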
1925da112707SJunchao Zhang   /* Solve L*y = b */
1926da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1927da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
19289371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
19299371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1930da112707SJunchao Zhang 
1931da112707SJunchao Zhang   /* Solve Lt*x = y */
1932da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
19339371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
19349371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1935da112707SJunchao Zhang 
1936da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1937da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1938da112707SJunchao Zhang 
1939da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1940da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
19413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1942da112707SJunchao Zhang }
1943da112707SJunchao Zhang 
19448eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1945d71ae5a4SJacob Faibussowitsch {
1946da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1947da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1948da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1949da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1950da112707SJunchao Zhang   PetscInt                      m, nz;
1951da112707SJunchao Zhang   PetscBool                     flg;
1952da112707SJunchao Zhang 
1953da112707SJunchao Zhang   PetscFunctionBegin;
1954da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1955da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1956da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1957da112707SJunchao Zhang   }
1958da112707SJunchao Zhang 
1959da112707SJunchao Zhang   /* Copy A's value to fact */
1960da112707SJunchao Zhang   m  = fact->rmap->n;
1961da112707SJunchao Zhang   nz = aij->nz;
1962da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1963da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1964da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1965da112707SJunchao Zhang 
1966da112707SJunchao Zhang   /* Factorize fact inplace */
1967da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
196846aba097SBarry Smith      csric02() only takes the lower triangular part of matrix A to perform factorization.
1969da112707SJunchao Zhang      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1970da112707SJunchao Zhang      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1971da112707SJunchao Zhang      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1972da112707SJunchao Zhang    */
1973d460d7bfSJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1974da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1975da112707SJunchao Zhang     int              numerical_zero;
1976da112707SJunchao Zhang     cusparseStatus_t status;
1977da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1978da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1979da112707SJunchao Zhang   }
1980da112707SJunchao Zhang 
1981204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1982204a0e31SJunchao Zhang   if (fs->updatedSpSVAnalysis) {
1983204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1984204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1985204a0e31SJunchao Zhang   } else
1986204a0e31SJunchao Zhang   #endif
1987204a0e31SJunchao Zhang   {
19889371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1989da112707SJunchao Zhang 
1990da112707SJunchao Zhang     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE:
1991da112707SJunchao Zhang        ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1992da112707SJunchao Zhang      */
19939371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1994204a0e31SJunchao Zhang     fs->updatedSpSVAnalysis = PETSC_TRUE;
1995204a0e31SJunchao Zhang   }
1996da112707SJunchao Zhang 
1997da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1998da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1999da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
2000da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
2001da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
2002da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
20033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2004da112707SJunchao Zhang }
2005da112707SJunchao Zhang 
20068eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2007d71ae5a4SJacob Faibussowitsch {
2008da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2009da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2010da112707SJunchao Zhang   PetscInt                      m, nz;
2011da112707SJunchao Zhang 
2012da112707SJunchao Zhang   PetscFunctionBegin;
2013da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
2014da112707SJunchao Zhang     PetscInt  i;
2015da112707SJunchao Zhang     PetscBool flg, missing;
2016da112707SJunchao Zhang 
2017da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2018da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2019da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2020da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
2021da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2022da112707SJunchao Zhang   }
2023da112707SJunchao Zhang 
2024da112707SJunchao Zhang   /* Free the old stale stuff */
2025da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2026da112707SJunchao Zhang 
2027da112707SJunchao Zhang   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host;
2028da112707SJunchao Zhang      they will not be used, but having them around makes debugging easier.
2029da112707SJunchao Zhang    */
2030da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2031da112707SJunchao Zhang 
2032da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2033da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
2034da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
2035da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
2036da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
2037da112707SJunchao Zhang 
2038da112707SJunchao Zhang   aij->row = NULL;
2039da112707SJunchao Zhang   aij->col = NULL;
2040da112707SJunchao Zhang 
2041da112707SJunchao Zhang   /* ====================================================================== */
2042da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2043da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
2044da112707SJunchao Zhang   /* ====================================================================== */
2045da112707SJunchao Zhang   const int *Ai, *Aj;
2046da112707SJunchao Zhang 
2047da112707SJunchao Zhang   m  = fact->rmap->n;
2048da112707SJunchao Zhang   nz = aij->nz;
2049da112707SJunchao Zhang 
2050f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2051f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2052da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2053da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2054d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2055d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2056da112707SJunchao Zhang 
2057da112707SJunchao Zhang   /* ====================================================================== */
2058da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
2059da112707SJunchao Zhang   /* ====================================================================== */
2060da112707SJunchao Zhang   cusparseFillMode_t fillMode;
2061da112707SJunchao Zhang   cusparseDiagType_t diagType;
2062da112707SJunchao Zhang 
2063da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2064da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2065da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2066da112707SJunchao Zhang 
2067da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2068da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2069da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2070da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2071da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2072da112707SJunchao Zhang   */
2073da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
2074da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
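  /* NON_UNIT because the IC(0) factor stores its actual (non-unit) diagonal entries, unlike the
     unit-diagonal L of an LU/ILU factorization; this rationale is ours, not a cuSPARSE requirement. */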
2075d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
20769371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
20779371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2078da112707SJunchao Zhang 
2079da112707SJunchao Zhang   /* ========================================================================= */
2080da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2081da112707SJunchao Zhang   /* ========================================================================= */
2082da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2083d460d7bfSJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2084da112707SJunchao Zhang 
2085da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2086da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2087da112707SJunchao Zhang 
2088da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2089da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2090da112707SJunchao Zhang 
2091da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
20929371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2093da112707SJunchao Zhang 
2094da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
20959371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2096da112707SJunchao Zhang 
209712ba2bc6SJunchao Zhang   /* To save device memory, we share the factorization buffer with the larger of the two triangular solve buffers.
209812ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
209912ba2bc6SJunchao Zhang    */
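  /* A worked sizing example with hypothetical numbers: if spsvBufferSize_L = 8 MB, spsvBufferSize_Lt = 6 MB
     and factBufferSize_M = 10 MB, the first branch below allocates factBuffer_M = max(8 MB, 10 MB) = 10 MB,
     points spsvBuffer_L at it, and allocates a separate 6 MB buffer for spsvBuffer_Lt. */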
210012ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
210112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
210212ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
2103da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
210412ba2bc6SJunchao Zhang   } else {
210512ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
210612ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
210712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
210812ba2bc6SJunchao Zhang   }
2109da112707SJunchao Zhang 
2110da112707SJunchao Zhang   /* ========================================================================== */
2111da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
2112da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
2113da112707SJunchao Zhang   /* ========================================================================== */
2114da112707SJunchao Zhang   int              structural_zero;
2115da112707SJunchao Zhang   cusparseStatus_t status;
2116da112707SJunchao Zhang 
2117da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2118d460d7bfSJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2119da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
212046aba097SBarry Smith     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2121da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2122da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2123da112707SJunchao Zhang   }
2124da112707SJunchao Zhang 
2125da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
21260dd8c0acSJunchao Zhang   {
2127da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
21280dd8c0acSJunchao Zhang     PetscInt      *Ai, nzRow, nzLeft;
2129da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
2130da112707SJunchao Zhang 
2131da112707SJunchao Zhang     Ai = Aseq->i;
2132da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
2133da112707SJunchao Zhang       nzRow = Ai[i + 1] - Ai[i];
2134da112707SJunchao Zhang       if (nzRow > 1) {
2135da112707SJunchao Zhang         /* We eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates the
2136da112707SJunchao Zhang           nonzeros to its right, including the eliminated entry itself, each update costing a multiplication and an addition.
2137da112707SJunchao Zhang         */
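        /* A quick check of the formula below with nzRow = 5: nzLeft = (5 - 1) / 2 = 2, so the row
           contributes 2 * (2.0 * 5 - 2 + 1) = 18 flops. */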
2138da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
2139da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2140da112707SJunchao Zhang       }
2141da112707SJunchao Zhang     }
2142da112707SJunchao Zhang     fs->numericFactFlops = flops;
21430dd8c0acSJunchao Zhang   }
2144da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
21453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2146da112707SJunchao Zhang }
2147da112707SJunchao Zhang #endif
2148da112707SJunchao Zhang 
2149d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2150d460d7bfSJunchao Zhang {
2151b820271fSJunchao Zhang   // use_cpu_solve is a field of Mat_SeqAIJCUSPARSE; B, being a factored matrix, instead carries Mat_SeqAIJCUSPARSETriFactors in its spptr, so we read the flag from A.
2152b820271fSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2153d460d7bfSJunchao Zhang 
2154d460d7bfSJunchao Zhang   PetscFunctionBegin;
2155d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2156d460d7bfSJunchao Zhang   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2157d460d7bfSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
2158d460d7bfSJunchao Zhang 
2159d460d7bfSJunchao Zhang   if (!cusparsestruct->use_cpu_solve) {
2160b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2161d460d7bfSJunchao Zhang     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2162d460d7bfSJunchao Zhang     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2163d460d7bfSJunchao Zhang #else
2164d460d7bfSJunchao Zhang     /* determine which version of MatSolve needs to be used. */
2165d460d7bfSJunchao Zhang     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2166d460d7bfSJunchao Zhang     IS          isrow = b->row, iscol = b->col;
2167d460d7bfSJunchao Zhang     PetscBool   row_identity, col_identity;
2168d460d7bfSJunchao Zhang 
2169d460d7bfSJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
2170d460d7bfSJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
2171d460d7bfSJunchao Zhang     if (row_identity && col_identity) {
2172d460d7bfSJunchao Zhang       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2173d460d7bfSJunchao Zhang       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2174d460d7bfSJunchao Zhang     } else {
2175d460d7bfSJunchao Zhang       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2176d460d7bfSJunchao Zhang       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2177d460d7bfSJunchao Zhang     }
2178d460d7bfSJunchao Zhang #endif
2179d460d7bfSJunchao Zhang   }
2180d460d7bfSJunchao Zhang   B->ops->matsolve          = NULL;
2181d460d7bfSJunchao Zhang   B->ops->matsolvetranspose = NULL;
2182d460d7bfSJunchao Zhang 
2183d460d7bfSJunchao Zhang   /* get the triangular factors */
2184d460d7bfSJunchao Zhang   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2185d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2186d460d7bfSJunchao Zhang }
2187d460d7bfSJunchao Zhang 
2188d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2189d460d7bfSJunchao Zhang {
2190d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2191d460d7bfSJunchao Zhang 
2192d460d7bfSJunchao Zhang   PetscFunctionBegin;
2193d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2194d460d7bfSJunchao Zhang   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2195d460d7bfSJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2196d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2197d460d7bfSJunchao Zhang }
2198d460d7bfSJunchao Zhang 
2199d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2200d71ae5a4SJacob Faibussowitsch {
2201da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2202da112707SJunchao Zhang 
2203da112707SJunchao Zhang   PetscFunctionBegin;
2204b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2205bc996fdcSJunchao Zhang   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2206f82ac72cSJunchao Zhang   if (!info->factoronhost) {
2207da112707SJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
2208da112707SJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
2209bc996fdcSJunchao Zhang   }
2210da112707SJunchao Zhang   if (!info->levels && row_identity && col_identity) {
2211da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2212da112707SJunchao Zhang   } else
2213da112707SJunchao Zhang #endif
2214da112707SJunchao Zhang   {
2215da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2216da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2217da112707SJunchao Zhang     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2218da112707SJunchao Zhang   }
22193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2220da112707SJunchao Zhang }
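
/* Dispatch note (illustrative): with zero fill levels (the default -pc_factor_levels 0), the natural
   ordering, and factorization not forced onto the host, the cuSPARSE ILU(0) fast path above is taken;
   otherwise we fall back to the CPU symbolic factorization. */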
2221da112707SJunchao Zhang 
2222d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2223d71ae5a4SJacob Faibussowitsch {
2224da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2225da112707SJunchao Zhang 
2226da112707SJunchao Zhang   PetscFunctionBegin;
2227b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2228bc996fdcSJunchao Zhang   PetscBool perm_identity = PETSC_FALSE;
2229f82ac72cSJunchao Zhang   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2230da112707SJunchao Zhang   if (!info->levels && perm_identity) {
2231da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2232da112707SJunchao Zhang   } else
2233da112707SJunchao Zhang #endif
2234da112707SJunchao Zhang   {
2235da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2236da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2237da112707SJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2238da112707SJunchao Zhang   }
22393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2240da112707SJunchao Zhang }
2241da112707SJunchao Zhang 
2242d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2243d71ae5a4SJacob Faibussowitsch {
2244da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2245da112707SJunchao Zhang 
2246da112707SJunchao Zhang   PetscFunctionBegin;
2247da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2248da112707SJunchao Zhang   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2249da112707SJunchao Zhang   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
22503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2251da112707SJunchao Zhang }
2252da112707SJunchao Zhang 
225366976f2fSJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2254d71ae5a4SJacob Faibussowitsch {
2255841d4cb1SJunchao Zhang   PetscFunctionBegin;
2256841d4cb1SJunchao Zhang   *type = MATSOLVERCUSPARSE;
22573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2258841d4cb1SJunchao Zhang }
2259841d4cb1SJunchao Zhang 
2260841d4cb1SJunchao Zhang /*MC
2261841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
226211a5261eSBarry Smith   of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported
2263841d4cb1SJunchao Zhang   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2264841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
226511a5261eSBarry Smith   cuSPARSE triangular solve algorithm. However, the performance can be quite poor, and thus these
2266841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2267841d4cb1SJunchao Zhang 
2268841d4cb1SJunchao Zhang   Level: beginner
2269841d4cb1SJunchao Zhang 
22701cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22712ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2272841d4cb1SJunchao Zhang M*/
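
/* A usage sketch (typical KSP workflow, shown for illustration): selecting this solver from the command
   line for an ILU-preconditioned solve might look like

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   where -pc_factor_mat_solver_type routes PCFactorSetMatSolverType() to MATSOLVERCUSPARSE. */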
2273841d4cb1SJunchao Zhang 
2274d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2275d71ae5a4SJacob Faibussowitsch {
2276841d4cb1SJunchao Zhang   PetscInt n = A->rmap->n;
2277841d4cb1SJunchao Zhang 
2278841d4cb1SJunchao Zhang   PetscFunctionBegin;
2279841d4cb1SJunchao Zhang   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2280841d4cb1SJunchao Zhang   PetscCall(MatSetSizes(*B, n, n, n, n));
2281b820271fSJunchao Zhang   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2282841d4cb1SJunchao Zhang   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2283841d4cb1SJunchao Zhang 
2284841d4cb1SJunchao Zhang   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2285841d4cb1SJunchao Zhang   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2286841d4cb1SJunchao Zhang     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2287841d4cb1SJunchao Zhang     if (!A->boundtocpu) {
2288841d4cb1SJunchao Zhang       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2289841d4cb1SJunchao Zhang       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2290841d4cb1SJunchao Zhang     } else {
2291841d4cb1SJunchao Zhang       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2292841d4cb1SJunchao Zhang       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2293841d4cb1SJunchao Zhang     }
2294841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2295841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2296841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2297841d4cb1SJunchao Zhang   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2298841d4cb1SJunchao Zhang     if (!A->boundtocpu) {
2299841d4cb1SJunchao Zhang       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2300841d4cb1SJunchao Zhang       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2301841d4cb1SJunchao Zhang     } else {
2302841d4cb1SJunchao Zhang       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2303841d4cb1SJunchao Zhang       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2304841d4cb1SJunchao Zhang     }
2305841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2306841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2307841d4cb1SJunchao Zhang   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2308841d4cb1SJunchao Zhang 
2309841d4cb1SJunchao Zhang   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2310841d4cb1SJunchao Zhang   (*B)->canuseordering = PETSC_TRUE;
2311f4f49eeaSPierre Jolivet   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
23123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2313841d4cb1SJunchao Zhang }
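
/* The programmatic counterpart of the command-line sketch above (illustrative):

     Mat F;
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ILU, &F));

   which dispatches to the routine above and sets up the symbolic-factorization function pointers. */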
2314841d4cb1SJunchao Zhang 
2315d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2316d71ae5a4SJacob Faibussowitsch {
23177e8381f9SStefano Zampini   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
23187e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2319b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2320da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
23210dd8c0acSJunchao Zhang #endif
23227e8381f9SStefano Zampini 
23237e8381f9SStefano Zampini   PetscFunctionBegin;
23247e8381f9SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
23259566063dSJacob Faibussowitsch     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2326da112707SJunchao Zhang     if (A->factortype == MAT_FACTOR_NONE) {
2327da112707SJunchao Zhang       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
23289566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2329da112707SJunchao Zhang     }
2330b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2331da112707SJunchao Zhang     else if (fs->csrVal) {
2332da112707SJunchao Zhang       /* We have a factorized matrix on device and are able to copy it to host */
2333da112707SJunchao Zhang       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2334da112707SJunchao Zhang     }
2335da112707SJunchao Zhang #endif
23369371c9d4SSatish Balay     else
23379371c9d4SSatish Balay       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
23389566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
23399566063dSJacob Faibussowitsch     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
23407e8381f9SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
23417e8381f9SStefano Zampini   }
23423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
23437e8381f9SStefano Zampini }
23447e8381f9SStefano Zampini 
2345d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2346d71ae5a4SJacob Faibussowitsch {
23477e8381f9SStefano Zampini   PetscFunctionBegin;
23489566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
234967a45760SJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
23503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
235167a45760SJunchao Zhang }
235267a45760SJunchao Zhang 
2353d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2354d71ae5a4SJacob Faibussowitsch {
235567a45760SJunchao Zhang   PetscFunctionBegin;
23567e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
235767a45760SJunchao Zhang   *array         = NULL;
23583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
235967a45760SJunchao Zhang }
236067a45760SJunchao Zhang 
2361d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2362d71ae5a4SJacob Faibussowitsch {
236367a45760SJunchao Zhang   PetscFunctionBegin;
23649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
236567a45760SJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
23663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
236767a45760SJunchao Zhang }
236867a45760SJunchao Zhang 
23698eb1d50fSPierre Jolivet static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2370d71ae5a4SJacob Faibussowitsch {
237167a45760SJunchao Zhang   PetscFunctionBegin;
237267a45760SJunchao Zhang   *array = NULL;
23733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
237467a45760SJunchao Zhang }
237567a45760SJunchao Zhang 
2376d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2377d71ae5a4SJacob Faibussowitsch {
237867a45760SJunchao Zhang   PetscFunctionBegin;
237967a45760SJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
23803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
238167a45760SJunchao Zhang }
238267a45760SJunchao Zhang 
2383d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2384d71ae5a4SJacob Faibussowitsch {
238567a45760SJunchao Zhang   PetscFunctionBegin;
238667a45760SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
238767a45760SJunchao Zhang   *array         = NULL;
23883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
23897e8381f9SStefano Zampini }
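
/* A usage sketch of the accessor contract above (the standard PETSc pattern, for illustration):

     PetscScalar *val;
     PetscCall(MatSeqAIJGetArray(A, &val));     // copies device values to host if needed
     val[0] *= 2.0;
     PetscCall(MatSeqAIJRestoreArray(A, &val)); // marks the host copy authoritative (PETSC_OFFLOAD_CPU)

   so the next MatSeqAIJCUSPARSECopyToGPU() re-uploads the values. */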
23907e8381f9SStefano Zampini 
2391d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2392d71ae5a4SJacob Faibussowitsch {
23937ee59b9bSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusp;
23947ee59b9bSJunchao Zhang   CsrMatrix          *matrix;
23957ee59b9bSJunchao Zhang 
23967ee59b9bSJunchao Zhang   PetscFunctionBegin;
23977ee59b9bSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
23987ee59b9bSJunchao Zhang   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
23997ee59b9bSJunchao Zhang   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
24007ee59b9bSJunchao Zhang   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
24017ee59b9bSJunchao Zhang   matrix = (CsrMatrix *)cusp->mat->mat;
24027ee59b9bSJunchao Zhang 
24037ee59b9bSJunchao Zhang   if (i) {
24047ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES)
24057ee59b9bSJunchao Zhang     *i = matrix->row_offsets->data().get();
24067ee59b9bSJunchao Zhang #else
24077ee59b9bSJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
24087ee59b9bSJunchao Zhang #endif
24097ee59b9bSJunchao Zhang   }
24107ee59b9bSJunchao Zhang   if (j) {
24117ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES)
24127ee59b9bSJunchao Zhang     *j = matrix->column_indices->data().get();
24137ee59b9bSJunchao Zhang #else
24147ee59b9bSJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
24157ee59b9bSJunchao Zhang #endif
24167ee59b9bSJunchao Zhang   }
24177ee59b9bSJunchao Zhang   if (a) *a = matrix->values->data().get();
24187ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
24193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
24207ee59b9bSJunchao Zhang }
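
/* Example retrieval of the raw device CSR arrays through the routine above (illustrative):

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));

   after which mtype is PETSC_MEMTYPE_CUDA and i, j, a point to device memory. */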
24217ee59b9bSJunchao Zhang 
2422d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2423d71ae5a4SJacob Faibussowitsch {
2424aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
24257c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
24269ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2427213423ffSJunchao Zhang   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2428aa372e3fSPaul Mullowney   cusparseStatus_t              stat;
2429abb89eb1SStefano Zampini   PetscBool                     both = PETSC_TRUE;
24309ae82921SPaul Mullowney 
24319ae82921SPaul Mullowney   PetscFunctionBegin;
243228b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2433c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2434a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2435a49f1ed0SStefano Zampini       CsrMatrix *matrix;
2436afb2bd1cSJunchao Zhang       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
243785ba7357SStefano Zampini 
243808401ef6SPierre Jolivet       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
24399566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2440afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a + a->nz);
24419566063dSJacob Faibussowitsch       PetscCallCUDA(WaitForCUDA());
2442f4f49eeaSPierre Jolivet       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
24439566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
24449566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
244534d6c7a5SJose E. Roman     } else {
2446abb89eb1SStefano Zampini       PetscInt nnz;
24479566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
24489566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
24499566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
24507c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
245181902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
2452a49f1ed0SStefano Zampini       cusparsestruct->workVector     = NULL;
2453a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
24549ae82921SPaul Mullowney       try {
24559ae82921SPaul Mullowney         if (a->compressedrow.use) {
24569ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
24579ae82921SPaul Mullowney           ii   = a->compressedrow.i;
24589ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
24599ae82921SPaul Mullowney         } else {
2460213423ffSJunchao Zhang           m    = A->rmap->n;
2461213423ffSJunchao Zhang           ii   = a->i;
2462e6e9a74fSStefano Zampini           ridx = NULL;
24639ae82921SPaul Mullowney         }
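        /* Illustration with hypothetical numbers: under compressed row storage, a 1000-row matrix whose
           nonzeros sit only in rows 3 and 7 stores m = 2 and ridx = {3, 7}, so later kernels touch only
           the nonempty rows. */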
246408401ef6SPierre Jolivet         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
24659371c9d4SSatish Balay         if (!a->a) {
24669371c9d4SSatish Balay           nnz  = ii[m];
24679371c9d4SSatish Balay           both = PETSC_FALSE;
24689371c9d4SSatish Balay         } else nnz = a->nz;
246908401ef6SPierre Jolivet         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
24709ae82921SPaul Mullowney 
247185ba7357SStefano Zampini         /* create cusparse matrix */
2472abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
2473aa372e3fSPaul Mullowney         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
24749566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
24759566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
24769566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
24779ae82921SPaul Mullowney 
2478f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2479f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2480f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
24819566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
24829566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
24839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
24849566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2485b06137fdSPaul Mullowney 
2486aa372e3fSPaul Mullowney         /* Build a hybrid/ELLPACK matrix if that storage format was chosen */
2487aa372e3fSPaul Mullowney         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2488aa372e3fSPaul Mullowney           /* set the matrix */
2489afb2bd1cSJunchao Zhang           CsrMatrix *mat   = new CsrMatrix;
2490afb2bd1cSJunchao Zhang           mat->num_rows    = m;
2491afb2bd1cSJunchao Zhang           mat->num_cols    = A->cmap->n;
2492abb89eb1SStefano Zampini           mat->num_entries = nnz;
2493ee477ddbSJunchao Zhang           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2494afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m + 1);
24959ae82921SPaul Mullowney 
2496ee477ddbSJunchao Zhang           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2497abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j + nnz);
2498aa372e3fSPaul Mullowney 
2499ee477ddbSJunchao Zhang           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2500abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a + nnz);
2501aa372e3fSPaul Mullowney 
2502aa372e3fSPaul Mullowney           /* assign the pointer */
2503afb2bd1cSJunchao Zhang           matstruct->mat = mat;
2504afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2505afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
25069371c9d4SSatish Balay             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
25079371c9d4SSatish Balay                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
25089371c9d4SSatish Balay             PetscCallCUSPARSE(stat);
2509afb2bd1cSJunchao Zhang           }
2510afb2bd1cSJunchao Zhang #endif
2511aa372e3fSPaul Mullowney         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2512afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2513afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2514afb2bd1cSJunchao Zhang #else
2515afb2bd1cSJunchao Zhang           CsrMatrix *mat   = new CsrMatrix;
2516afb2bd1cSJunchao Zhang           mat->num_rows    = m;
2517afb2bd1cSJunchao Zhang           mat->num_cols    = A->cmap->n;
2518abb89eb1SStefano Zampini           mat->num_entries = nnz;
2519ee477ddbSJunchao Zhang           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2520afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m + 1);
2521aa372e3fSPaul Mullowney 
2522ee477ddbSJunchao Zhang           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2523abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j + nnz);
2524aa372e3fSPaul Mullowney 
2525ee477ddbSJunchao Zhang           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2526abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a + nnz);
2527aa372e3fSPaul Mullowney 
2528aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
25299566063dSJacob Faibussowitsch           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
25309371c9d4SSatish Balay           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
25319371c9d4SSatish Balay           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
25329371c9d4SSatish Balay           PetscCallCUSPARSE(stat);
2533aa372e3fSPaul Mullowney           /* assign the pointer */
2534aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
2535aa372e3fSPaul Mullowney 
2536afb2bd1cSJunchao Zhang           if (mat) {
2537afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY *)mat->values;
2538afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2539afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2540afb2bd1cSJunchao Zhang             delete (CsrMatrix *)mat;
2541087f3262SPaul Mullowney           }
2542afb2bd1cSJunchao Zhang #endif
2543087f3262SPaul Mullowney         }
2544ca45077fSPaul Mullowney 
2545aa372e3fSPaul Mullowney         /* assign the compressed row indices */
2546213423ffSJunchao Zhang         if (a->compressedrow.use) {
2547ee477ddbSJunchao Zhang           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2548ee477ddbSJunchao Zhang           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2549aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx, ridx + m);
2550213423ffSJunchao Zhang           tmp = m;
2551213423ffSJunchao Zhang         } else {
2552213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
2553213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
2554213423ffSJunchao Zhang           tmp                        = 0;
2555213423ffSJunchao Zhang         }
25569566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2557aa372e3fSPaul Mullowney 
2558aa372e3fSPaul Mullowney         /* assign the pointer */
2559aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
2560d71ae5a4SJacob Faibussowitsch       } catch (char *ex) {
2561d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2562d71ae5a4SJacob Faibussowitsch       }
25639566063dSJacob Faibussowitsch       PetscCallCUDA(WaitForCUDA());
25649566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
256534d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
256634d6c7a5SJose E. Roman     }
2567abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
25689ae82921SPaul Mullowney   }
25693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
25709ae82921SPaul Mullowney }
25719ae82921SPaul Mullowney 
25729371c9d4SSatish Balay struct VecCUDAPlusEquals {
2573aa372e3fSPaul Mullowney   template <typename Tuple>
2574d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2575d71ae5a4SJacob Faibussowitsch   {
2576aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2577aa372e3fSPaul Mullowney   }
2578aa372e3fSPaul Mullowney };
2579aa372e3fSPaul Mullowney 
25809371c9d4SSatish Balay struct VecCUDAEquals {
25817e8381f9SStefano Zampini   template <typename Tuple>
2582d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2583d71ae5a4SJacob Faibussowitsch   {
25847e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
25857e8381f9SStefano Zampini   }
25867e8381f9SStefano Zampini };
25877e8381f9SStefano Zampini 
25889371c9d4SSatish Balay struct VecCUDAEqualsReverse {
2589e6e9a74fSStefano Zampini   template <typename Tuple>
2590d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2591d71ae5a4SJacob Faibussowitsch   {
2592e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
2593e6e9a74fSStefano Zampini   }
2594e6e9a74fSStefano Zampini };
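
/* A usage sketch (illustrative, with hypothetical thrust::device_vector<PetscScalar> x, y): the functors
   above are applied across zip iterators, e.g.

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
                      VecCUDAPlusEquals());

   computes y[i] += x[i], since get<0>(t) is the x entry and get<1>(t) the y entry of each tuple. */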
2595e6e9a74fSStefano Zampini 
2596afb2bd1cSJunchao Zhang struct MatMatCusparse {
2597ccdfe979SStefano Zampini   PetscBool      cisdense;
2598ccdfe979SStefano Zampini   PetscScalar   *Bt;
2599ccdfe979SStefano Zampini   Mat            X;
2600fcdce8c4SStefano Zampini   PetscBool      reusesym; /* cuSPARSE does not have split symbolic and numeric phases for sparse matmat operations */
2601fcdce8c4SStefano Zampini   PetscLogDouble flops;
2602fcdce8c4SStefano Zampini   CsrMatrix     *Bcsr;
2603b4285af6SJunchao Zhang 
2604afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2605fcdce8c4SStefano Zampini   cusparseSpMatDescr_t matSpBDescr;
2606afb2bd1cSJunchao Zhang   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2607afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
2608afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
2609afb2bd1cSJunchao Zhang   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2610b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2611b4285af6SJunchao Zhang   void *dBuffer4;
2612b4285af6SJunchao Zhang   void *dBuffer5;
2613b4285af6SJunchao Zhang   #endif
2614fcdce8c4SStefano Zampini   size_t                mmBufferSize;
2615fcdce8c4SStefano Zampini   void                 *mmBuffer;
2616fcdce8c4SStefano Zampini   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2617fcdce8c4SStefano Zampini   cusparseSpGEMMDescr_t spgemmDesc;
2618afb2bd1cSJunchao Zhang #endif
2619afb2bd1cSJunchao Zhang };
2620ccdfe979SStefano Zampini 
2621d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2622d71ae5a4SJacob Faibussowitsch {
2623ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2624ccdfe979SStefano Zampini 
2625ccdfe979SStefano Zampini   PetscFunctionBegin;
26269566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(mmdata->Bt));
2627fcdce8c4SStefano Zampini   delete mmdata->Bcsr;
2628afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
26299566063dSJacob Faibussowitsch   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
26309566063dSJacob Faibussowitsch   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
26319566063dSJacob Faibussowitsch   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
26329566063dSJacob Faibussowitsch   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2633b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
26349566063dSJacob Faibussowitsch   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
26359566063dSJacob Faibussowitsch   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2636b4285af6SJunchao Zhang   #endif
26379566063dSJacob Faibussowitsch   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
26389566063dSJacob Faibussowitsch   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2639afb2bd1cSJunchao Zhang #endif
26409566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&mmdata->X));
26419566063dSJacob Faibussowitsch   PetscCall(PetscFree(data));
26423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2643ccdfe979SStefano Zampini }
2644ccdfe979SStefano Zampini 
26454742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2646ccdfe979SStefano Zampini 
2647d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2648d71ae5a4SJacob Faibussowitsch {
2649ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2650ccdfe979SStefano Zampini   Mat                           A, B;
2651afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2652ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2653ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2654ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2655ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2656ccdfe979SStefano Zampini   const PetscScalar            *barray;
2657ccdfe979SStefano Zampini   PetscScalar                  *carray;
2658ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2659ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2660ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2661ccdfe979SStefano Zampini 
2662ccdfe979SStefano Zampini   PetscFunctionBegin;
2663ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
266428b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2665ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2666ccdfe979SStefano Zampini   A      = product->A;
2667ccdfe979SStefano Zampini   B      = product->B;
26689566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
266928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2670ccdfe979SStefano Zampini   /* currently MatSeqAIJCUSPARSECopyToGPU() does not copy if the matrix is bound to the CPU;
2671ccdfe979SStefano Zampini      instead of silently accepting a wrong answer, we prefer to raise an error */
267228b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2674ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2675ccdfe979SStefano Zampini   switch (product->type) {
2676ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2677ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2678ccdfe979SStefano Zampini     mat = cusp->mat;
2679ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2680ccdfe979SStefano Zampini     m   = A->rmap->n;
2681ccdfe979SStefano Zampini     n   = B->cmap->n;
2682ccdfe979SStefano Zampini     break;
2683ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
26841a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2685e6e9a74fSStefano Zampini       mat = cusp->mat;
2686e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2687e6e9a74fSStefano Zampini     } else {
26889566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2689ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2690ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2691e6e9a74fSStefano Zampini     }
2692ccdfe979SStefano Zampini     m = A->cmap->n;
2693ccdfe979SStefano Zampini     n = B->cmap->n;
2694ccdfe979SStefano Zampini     break;
2695ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2696ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2697ccdfe979SStefano Zampini     mat = cusp->mat;
2698ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2699ccdfe979SStefano Zampini     m   = A->rmap->n;
2700ccdfe979SStefano Zampini     n   = B->rmap->n;
2701ccdfe979SStefano Zampini     break;
2702d71ae5a4SJacob Faibussowitsch   default:
2703d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2704ccdfe979SStefano Zampini   }
270528b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2706ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2707ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27089566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27099566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2710cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2711afb2bd1cSJunchao Zhang 
27129566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2713c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2714cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27159566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2716c8378d12SStefano Zampini   } else {
2717cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27189566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2719c8378d12SStefano Zampini   }
2720c8378d12SStefano Zampini 
27219566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2722afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2723afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2724fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2725fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2726fe5544b9SJunchao Zhang   #else
2727fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2728fe5544b9SJunchao Zhang   #endif
2729fe5544b9SJunchao Zhang 
2730a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2731afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2732fcdce8c4SStefano Zampini     size_t mmBufferSize;
27339371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27349371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27359371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27369371c9d4SSatish Balay     }
2737afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27389566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2739afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2740afb2bd1cSJunchao Zhang     }
2741c8378d12SStefano Zampini 
27429371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27439371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27449371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27459371c9d4SSatish Balay     }
2746afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27479566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2748afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2749afb2bd1cSJunchao Zhang     }
2750afb2bd1cSJunchao Zhang 
2751fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2752fe5544b9SJunchao Zhang     if (matADescr) {
275317f5f06fSJunchao Zhang       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // we found matADescr could not be reused; it could be a cuSPARSE bug
2754fe5544b9SJunchao Zhang       matADescr = NULL;
2755fe5544b9SJunchao Zhang     }
2756fe5544b9SJunchao Zhang   #endif
2757fe5544b9SJunchao Zhang 
2758fe5544b9SJunchao Zhang     if (!matADescr) {
2759fe5544b9SJunchao Zhang       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27609371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27619371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2762afb2bd1cSJunchao Zhang     }
2763fe5544b9SJunchao Zhang 
2764fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2765fe5544b9SJunchao Zhang 
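    /* grow-only buffer management: reallocate only when the required size exceeds the cached
       allocation, so repeated calls avoid unnecessary cudaFree/cudaMalloc cycles */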
2766fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27679566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27689566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2769fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2770fcdce8c4SStefano Zampini     }
2771fe5544b9SJunchao Zhang 
2772f0b74427SPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2773fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2774fe5544b9SJunchao Zhang   #endif
2775fe5544b9SJunchao Zhang 
2776afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2777afb2bd1cSJunchao Zhang   } else {
2778afb2bd1cSJunchao Zhang     /* to be safe, always refresh the data pointers stored in the matrix descriptors */
2779fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
27809566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
27819566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2782afb2bd1cSJunchao Zhang   }
2783afb2bd1cSJunchao Zhang 
2784afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2785fe5544b9SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2786afb2bd1cSJunchao Zhang #else
2787afb2bd1cSJunchao Zhang   PetscInt k;
2788afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2789ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2790ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2791ccdfe979SStefano Zampini     cublasStatus_t cerr;
2792ccdfe979SStefano Zampini 
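    /* cublasXgeam computes alpha*op(A) + beta*op(B); with both ops transposed, alpha = 1 and
       beta = 0, it writes B^T (leading dimension B->cmap->n) into mmdata->Bt */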
27939566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
27949371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
27959371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2796ccdfe979SStefano Zampini     blda = B->cmap->n;
2797afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2798afb2bd1cSJunchao Zhang   } else {
2799afb2bd1cSJunchao Zhang     k = B->rmap->n;
2800ccdfe979SStefano Zampini   }
2801ccdfe979SStefano Zampini 
2802afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
28039371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
28049371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2805afb2bd1cSJunchao Zhang #endif
28069566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
28079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2808cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2809ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2810cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28114742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2812ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2813cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28144742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2815ccdfe979SStefano Zampini   } else {
2816cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2817ccdfe979SStefano Zampini   }
281848a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
281948a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
28203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2821ccdfe979SStefano Zampini }
2822ccdfe979SStefano Zampini 
2823d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2824d71ae5a4SJacob Faibussowitsch {
2825ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2826ccdfe979SStefano Zampini   Mat                 A, B;
2827ccdfe979SStefano Zampini   PetscInt            m, n;
2828ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2829ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2830ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2831ccdfe979SStefano Zampini 
2832ccdfe979SStefano Zampini   PetscFunctionBegin;
2833ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
283428b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2835ccdfe979SStefano Zampini   A = product->A;
2836ccdfe979SStefano Zampini   B = product->B;
28379566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
283828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2839ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
284008401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
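  /* result sizes by product type: AB is (A->rmap->n) x (B->cmap->n), AtB is (A->cmap->n) x (B->cmap->n),
     ABt is (A->rmap->n) x (B->rmap->n); PtAP = Pt*A*P and RARt = R*A*Rt (with B playing P or R)
     are square of size B->cmap->n and B->rmap->n, respectively */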
2841ccdfe979SStefano Zampini   switch (product->type) {
2842ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2843ccdfe979SStefano Zampini     m = A->rmap->n;
2844ccdfe979SStefano Zampini     n = B->cmap->n;
28450e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2846ccdfe979SStefano Zampini     break;
2847ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2848ccdfe979SStefano Zampini     m = A->cmap->n;
2849ccdfe979SStefano Zampini     n = B->cmap->n;
28500e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
28510e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2852ccdfe979SStefano Zampini     break;
2853ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2854ccdfe979SStefano Zampini     m = A->rmap->n;
2855ccdfe979SStefano Zampini     n = B->rmap->n;
28560e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
28570e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2858ccdfe979SStefano Zampini     break;
2859ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2860ccdfe979SStefano Zampini     m = B->cmap->n;
2861ccdfe979SStefano Zampini     n = B->cmap->n;
28620e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
28630e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2864ccdfe979SStefano Zampini     break;
2865ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2866ccdfe979SStefano Zampini     m = B->rmap->n;
2867ccdfe979SStefano Zampini     n = B->rmap->n;
28680e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
28690e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2870ccdfe979SStefano Zampini     break;
2871d71ae5a4SJacob Faibussowitsch   default:
2872d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2873ccdfe979SStefano Zampini   }
28749566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2875ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
28769566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
28779566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2878ccdfe979SStefano Zampini 
2879ccdfe979SStefano Zampini   /* product data */
28809566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2881ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2882afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2883afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
288448a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2885afb2bd1cSJunchao Zhang #endif
2886ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
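  /* for RARt, X stores A*Rt (then C = R*X); for PtAP, X stores A*P (then C = Pt*X), with B playing R or P */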
2887ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
28889566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
28899566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2890ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
28919566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2892ccdfe979SStefano Zampini     } else {
28939566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2894ccdfe979SStefano Zampini     }
2895ccdfe979SStefano Zampini   }
2896ccdfe979SStefano Zampini   C->product->data    = mmdata;
2897ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2898ccdfe979SStefano Zampini 
2899ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
29003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2901ccdfe979SStefano Zampini }
2902ccdfe979SStefano Zampini 
2903d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2904d71ae5a4SJacob Faibussowitsch {
2905ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2906fcdce8c4SStefano Zampini   Mat                           A, B;
2907fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2908fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2909fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2910fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2911fcdce8c4SStefano Zampini   PetscBool                     flg;
2912fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2913fcdce8c4SStefano Zampini   MatProductType                ptype;
2914fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2915fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2916fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2917fcdce8c4SStefano Zampini #endif
2918b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2919ccdfe979SStefano Zampini 
2920ccdfe979SStefano Zampini   PetscFunctionBegin;
2921ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
292228b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
29239566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
292428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2925fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2926fcdce8c4SStefano Zampini   A      = product->A;
2927fcdce8c4SStefano Zampini   B      = product->B;
2928fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2929fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2930fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
293108401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2932fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
293328b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2934fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
293528b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2936fcdce8c4SStefano Zampini     goto finalize;
2937fcdce8c4SStefano Zampini   }
2938fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29399566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
294028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29419566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
294228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
294328b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
294428b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2945fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2946fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2947fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
294808401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
294908401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
295008401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
29519566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29529566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2953fcdce8c4SStefano Zampini 
2954fcdce8c4SStefano Zampini   ptype = product->type;
2955b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2956fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
295728b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2958fa046f9fSJunchao Zhang   }
2959b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2960fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
296128b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2962fa046f9fSJunchao Zhang   }
2963fcdce8c4SStefano Zampini   switch (ptype) {
2964fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2965fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2966fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2967fcdce8c4SStefano Zampini     break;
2968fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2969fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2970fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2971fcdce8c4SStefano Zampini     break;
2972fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2973fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2974fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2975fcdce8c4SStefano Zampini     break;
2976d71ae5a4SJacob Faibussowitsch   default:
2977d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2978fcdce8c4SStefano Zampini   }
2979fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
298028b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
298128b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
298228b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2983fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2984fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2985fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
298628b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
298728b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
298828b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
29899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2990fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2991fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
29929566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2993b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
29949371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29959371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2996b4285af6SJunchao Zhang   #else
29979371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29989371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29999371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3001b4285af6SJunchao Zhang   #endif
3002fcdce8c4SStefano Zampini #else
30039371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30049371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30059371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3006fcdce8c4SStefano Zampini #endif
30079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30089566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
30099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3010fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
3011fcdce8c4SStefano Zampini finalize:
3012fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
30139566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
30149566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
30159566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3016fcdce8c4SStefano Zampini   c->reallocs = 0;
3017fcdce8c4SStefano Zampini   C->info.mallocs += 0;
3018fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
3019fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
3020fcdce8c4SStefano Zampini   C->num_ass++;
30213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3022ccdfe979SStefano Zampini }
3023fcdce8c4SStefano Zampini 
3024d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3025d71ae5a4SJacob Faibussowitsch {
3026fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3027fcdce8c4SStefano Zampini   Mat                           A, B;
3028fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3029fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3030fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3031fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3032fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3033fcdce8c4SStefano Zampini   PetscBool                     flg;
3034fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3035fcdce8c4SStefano Zampini   MatProductType                ptype;
3036fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3037fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3038fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3039fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3040fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3041fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3042fcdce8c4SStefano Zampini #else
3043fcdce8c4SStefano Zampini   int cnz;
3044fcdce8c4SStefano Zampini #endif
3045b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3046fcdce8c4SStefano Zampini 
3047fcdce8c4SStefano Zampini   PetscFunctionBegin;
3048fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
304928b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3050fcdce8c4SStefano Zampini   A = product->A;
3051fcdce8c4SStefano Zampini   B = product->B;
30529566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
305328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30549566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
305528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3056fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3057fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3058fcdce8c4SStefano Zampini   /* product data */
30599566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3060fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3061fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3062fcdce8c4SStefano Zampini 
30639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3065d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3066d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
306708401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
306808401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3069d60bce21SJunchao Zhang 
3070fcdce8c4SStefano Zampini   ptype = product->type;
3071b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3072fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3073fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3074fa046f9fSJunchao Zhang   }
3075b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3076fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3077fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3078fa046f9fSJunchao Zhang   }
3079fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3080fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3081fcdce8c4SStefano Zampini   switch (ptype) {
3082fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3083fcdce8c4SStefano Zampini     m    = A->rmap->n;
3084fcdce8c4SStefano Zampini     n    = B->cmap->n;
3085fcdce8c4SStefano Zampini     k    = A->cmap->n;
3086fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3087fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3088fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3089fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3090fcdce8c4SStefano Zampini     break;
3091fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3092fcdce8c4SStefano Zampini     m = A->cmap->n;
3093fcdce8c4SStefano Zampini     n = B->cmap->n;
3094fcdce8c4SStefano Zampini     k = A->rmap->n;
30959566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3096fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3097fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3098fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3099fcdce8c4SStefano Zampini     break;
3100fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3101fcdce8c4SStefano Zampini     m = A->rmap->n;
3102fcdce8c4SStefano Zampini     n = B->rmap->n;
3103fcdce8c4SStefano Zampini     k = A->cmap->n;
31049566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3105fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3106fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3107fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3108fcdce8c4SStefano Zampini     break;
3109d71ae5a4SJacob Faibussowitsch   default:
3110d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3111fcdce8c4SStefano Zampini   }
3112fcdce8c4SStefano Zampini 
3113fcdce8c4SStefano Zampini   /* create cusparse matrix */
31149566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
31159566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3116fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3117fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3118fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3119fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3120fcdce8c4SStefano Zampini 
3121fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3122fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if A is stored in compressed row format, then C will be in compressed row format too */
3123fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
31249566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
31259566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3126fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3127fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3128fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3129fcdce8c4SStefano Zampini   } else {
3130fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3131fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3132fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3133fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3134fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3135fcdce8c4SStefano Zampini   }
3136fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3137fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3138fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3139fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3140fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3141fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31429566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3145f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3146f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3147f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31489566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31499566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31509566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
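  /* the alpha/beta constants live in device memory because the cusparse handle is switched to
     CUSPARSE_POINTER_MODE_DEVICE before the SpGEMM value computations below */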
3151fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in various calls when matrices have zero rows/columns! */
3152d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3153fcdce8c4SStefano Zampini     c->nz                = 0;
3154fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3155fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3156fcdce8c4SStefano Zampini     goto finalizesym;
3157fcdce8c4SStefano Zampini   }
3158fcdce8c4SStefano Zampini 
315928b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
316028b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3161fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3162fcdce8c4SStefano Zampini   if (!biscompressed) {
3163fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3164fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3165fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3166fcdce8c4SStefano Zampini #endif
3167fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3168fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3169fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3170fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3171fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3172fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3173fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3174fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3175fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3176fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3177fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31789566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3179fcdce8c4SStefano Zampini     }
3180fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3181fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3182fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3183fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
31849371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31859371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3186fcdce8c4SStefano Zampini     }
3187fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3188fcdce8c4SStefano Zampini #endif
3189fcdce8c4SStefano Zampini   }
319028b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
319128b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3192fcdce8c4SStefano Zampini   /* precompute flops count */
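  /* AB: each nonzero a_ij multiplies the entire row j of B, i.e. 2*(b->i[j+1]-b->i[j]) flops;
     AtB: row i of A combines with row i of B as an outer product, i.e. 2*anzi*bnzi flops */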
3193fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3194fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3195fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3196fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3197fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3198fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3199fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3200fcdce8c4SStefano Zampini       }
3201fcdce8c4SStefano Zampini     }
3202fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3203fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3204fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3205fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3206fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3207fcdce8c4SStefano Zampini     }
3208fcdce8c4SStefano Zampini   } else { /* TODO */
3209fcdce8c4SStefano Zampini     flops = 0.;
3210fcdce8c4SStefano Zampini   }
3211fcdce8c4SStefano Zampini 
3212fcdce8c4SStefano Zampini   mmdata->flops = flops;
32139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3214b4285af6SJunchao Zhang 
3215fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
32169566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
32171ffab3bdSJunchao Zhang   // CUDA 12.2 requires a non-null csrRowOffsets
32181ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32199371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32209566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3221b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3222b4285af6SJunchao Zhang   {
3223b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3224b4285af6SJunchao Zhang        We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse */
3226b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3227b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3228b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3229b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3230b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3231b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3232b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3233b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3234b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3235b4285af6SJunchao Zhang 
3236b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32379371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32389371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32399566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3240b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32419371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32429371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3243b4285af6SJunchao Zhang 
32449371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32499371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32509371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3253b4285af6SJunchao Zhang 
3254b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32559566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3256b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3257b4285af6SJunchao Zhang     /* allocate matrix C */
32589371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32599371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32609371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32619371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3262b4285af6SJunchao Zhang     /* update matC with the new pointers */
32639371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32649371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3265b4285af6SJunchao Zhang 
32669371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32679371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32689566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32699371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32709371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32729371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32739371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32749566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3275b4285af6SJunchao Zhang   }
3276ae37ee31SJunchao Zhang   #else
3277b4285af6SJunchao Zhang   size_t bufSize2;
3278fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
32799371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
32809371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32819566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3282fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
32839371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
32849371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3285fcdce8c4SStefano Zampini   /* query the buffer size needed by the compute step */
32869371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
32879371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3288fcdce8c4SStefano Zampini   /* Neither the CUSPARSE documentation nor the API is clear here: we need both buffers to
3289fcdce8c4SStefano Zampini      perform the operations properly. mmdata->mmBuffer2 appears only in the workEstimation
3290fcdce8c4SStefano Zampini      calls and nowhere in the compute/copy API, yet it seems to be required during compute,
3291fcdce8c4SStefano Zampini      so its address is probably cached inside the descriptor. What a messy API... */
32939566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3294fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
32959371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
32969371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3297fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
32989566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3299fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
33009371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
33019371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3302fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33039566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3304fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33059566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
33069371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
33079371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33089371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
33099371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3310ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3311fcdce8c4SStefano Zampini #else
33129566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
33139371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33149371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
33159371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3316fcdce8c4SStefano Zampini   c->nz                = cnz;
3317fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33209566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3321fcdce8c4SStefano Zampini 
33229566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3323fcdce8c4SStefano Zampini   /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
3324fcdce8c4SStefano Zampini      The gemm2 interface (alpha * A * B + beta * D) allows a symbolic-only pass by passing NULL for the values,
3325fcdce8c4SStefano Zampini      but it is quite buggy when D is NULL, despite the CUSPARSE documentation claiming that case is supported! */
33269371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33279371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
33289371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3329fcdce8c4SStefano Zampini #endif
33309566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3332fcdce8c4SStefano Zampini finalizesym:
3333fcdce8c4SStefano Zampini   c->free_a = PETSC_TRUE;
33349f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
33359f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3336fcdce8c4SStefano Zampini   c->free_ij = PETSC_TRUE;
33377de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3338fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3339fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3340fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3341fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3342fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3343fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346fcdce8c4SStefano Zampini   } else {
3347fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3348fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33499566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3351fcdce8c4SStefano Zampini   }
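  /* worked example of the expansion below: with m = 4, nonzero rows rindex = {1, 3} and
     compressed offsets {0, 2, 5}, the expanded row offsets become c->i = {0, 0, 2, 2, 5} */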
3352fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3353fcdce8c4SStefano Zampini     PetscInt r = 0;
3354fcdce8c4SStefano Zampini     c->i[0]    = 0;
3355fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3356fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3357fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3358fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3359fcdce8c4SStefano Zampini     }
3360fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3361fcdce8c4SStefano Zampini   }
33629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3365fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3366fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3367fcdce8c4SStefano Zampini   c->rmax          = 0;
3368fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3369fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3370fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3371fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3372fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3373fcdce8c4SStefano Zampini   }
33749566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
33759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3376fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3377fcdce8c4SStefano Zampini 
3378fcdce8c4SStefano Zampini   C->nonzerostate++;
33799566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
33809566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3381fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3382fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3383fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3384fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3385fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3386abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3387fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3388fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3389fcdce8c4SStefano Zampini   }
3390fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
33913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3392fcdce8c4SStefano Zampini }
3393fcdce8c4SStefano Zampini 
3394fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3395fcdce8c4SStefano Zampini 
3396fcdce8c4SStefano Zampini /* handles sparse or dense B */
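/* Dispatch: a dense B is routed to the AIJCUSPARSE*DENSECUDA kernels; when B (and C, for ABC)
   are CUSPARSE matrices the GPU SpGEMM path is used, unless the user requests the CPU backend,
   e.g. with -matmatmult_backend_cpu (API users) or -mat_product_algorithm_backend_cpu */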
3397d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3398d71ae5a4SJacob Faibussowitsch {
3399fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3400fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3401fcdce8c4SStefano Zampini 
3402fcdce8c4SStefano Zampini   PetscFunctionBegin;
3403fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
34049566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
340548a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3406fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3407fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
340848a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3409fcdce8c4SStefano Zampini   }
341065e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
341165e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
341265e4b4d4SStefano Zampini     switch (product->type) {
341365e4b4d4SStefano Zampini     case MATPRODUCT_AB:
341465e4b4d4SStefano Zampini       if (product->api_user) {
3415d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
34169566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3417d0609cedSBarry Smith         PetscOptionsEnd();
341865e4b4d4SStefano Zampini       } else {
3419d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
34209566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3421d0609cedSBarry Smith         PetscOptionsEnd();
342265e4b4d4SStefano Zampini       }
342365e4b4d4SStefano Zampini       break;
342465e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
342565e4b4d4SStefano Zampini       if (product->api_user) {
3426d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
34279566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3428d0609cedSBarry Smith         PetscOptionsEnd();
342965e4b4d4SStefano Zampini       } else {
3430d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
34319566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3432d0609cedSBarry Smith         PetscOptionsEnd();
343365e4b4d4SStefano Zampini       }
343465e4b4d4SStefano Zampini       break;
343565e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
343665e4b4d4SStefano Zampini       if (product->api_user) {
3437d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
34389566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3439d0609cedSBarry Smith         PetscOptionsEnd();
344065e4b4d4SStefano Zampini       } else {
3441d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
34429566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3443d0609cedSBarry Smith         PetscOptionsEnd();
344465e4b4d4SStefano Zampini       }
344565e4b4d4SStefano Zampini       break;
344665e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
344765e4b4d4SStefano Zampini       if (product->api_user) {
3448d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
34499566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3450d0609cedSBarry Smith         PetscOptionsEnd();
345165e4b4d4SStefano Zampini       } else {
3452d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
34539566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3454d0609cedSBarry Smith         PetscOptionsEnd();
345565e4b4d4SStefano Zampini       }
345665e4b4d4SStefano Zampini       break;
345765e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
345865e4b4d4SStefano Zampini       if (product->api_user) {
3459d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
34609566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3461d0609cedSBarry Smith         PetscOptionsEnd();
346265e4b4d4SStefano Zampini       } else {
3463d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
34649566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3465d0609cedSBarry Smith         PetscOptionsEnd();
346665e4b4d4SStefano Zampini       }
346765e4b4d4SStefano Zampini       break;
3468d71ae5a4SJacob Faibussowitsch     default:
3469d71ae5a4SJacob Faibussowitsch       break;
347065e4b4d4SStefano Zampini     }
347165e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
347265e4b4d4SStefano Zampini   }
347365e4b4d4SStefano Zampini   /* dispatch */
3474fcdce8c4SStefano Zampini   if (isdense) {
3475ccdfe979SStefano Zampini     switch (product->type) {
3476ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3477ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3478ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3479ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3480ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3481fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
34829566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3483fcdce8c4SStefano Zampini       } else {
3484fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3485fcdce8c4SStefano Zampini       }
3486fcdce8c4SStefano Zampini       break;
3487d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3488d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3489d71ae5a4SJacob Faibussowitsch       break;
3490d71ae5a4SJacob Faibussowitsch     default:
3491d71ae5a4SJacob Faibussowitsch       break;
3492ccdfe979SStefano Zampini     }
3493fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3494fcdce8c4SStefano Zampini     switch (product->type) {
3495fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3496fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3497d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3498d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3499d71ae5a4SJacob Faibussowitsch       break;
3500fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3501fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3502d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3503d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3504d71ae5a4SJacob Faibussowitsch       break;
3505d71ae5a4SJacob Faibussowitsch     default:
3506d71ae5a4SJacob Faibussowitsch       break;
3507fcdce8c4SStefano Zampini     }
3508fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
35099566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3510fcdce8c4SStefano Zampini   }
35113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3512ccdfe979SStefano Zampini }
3513ccdfe979SStefano Zampini 
3514d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3515d71ae5a4SJacob Faibussowitsch {
35169ae82921SPaul Mullowney   PetscFunctionBegin;
35179566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
35183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3519e6e9a74fSStefano Zampini }
3520e6e9a74fSStefano Zampini 
3521d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3522d71ae5a4SJacob Faibussowitsch {
3523e6e9a74fSStefano Zampini   PetscFunctionBegin;
35249566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
35253ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3526e6e9a74fSStefano Zampini }
3527e6e9a74fSStefano Zampini 
3528d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3529d71ae5a4SJacob Faibussowitsch {
3530e6e9a74fSStefano Zampini   PetscFunctionBegin;
35319566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
35323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3533e6e9a74fSStefano Zampini }
3534e6e9a74fSStefano Zampini 
3535d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3536d71ae5a4SJacob Faibussowitsch {
3537e6e9a74fSStefano Zampini   PetscFunctionBegin;
35389566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
35393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
35409ae82921SPaul Mullowney }
35419ae82921SPaul Mullowney 
3542d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3543d71ae5a4SJacob Faibussowitsch {
3544ca45077fSPaul Mullowney   PetscFunctionBegin;
35459566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
35463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3547ca45077fSPaul Mullowney }
3548ca45077fSPaul Mullowney 
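/* CUDA kernel: one thread per entry; scatter-adds the compressed work vector x into the
   full-length vector y at the row positions idx, i.e., y[idx[i]] += x[i]. Used below to expand
   SpMV results of row-compressed (zero-row-free) matrices. */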
3549d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3550d71ae5a4SJacob Faibussowitsch {
3551a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3552a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3553a0e72f99SJunchao Zhang }
3554a0e72f99SJunchao Zhang 
3555afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3556d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3557d71ae5a4SJacob Faibussowitsch {
35589ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3559aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
35609ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3561e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3562e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3563e6e9a74fSStefano Zampini   PetscBool                     compressed;
3564afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3565afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3566afb2bd1cSJunchao Zhang #endif
35676e111a19SKarl Rupp 
35689ae82921SPaul Mullowney   PetscFunctionBegin;
356908401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported");
3570cbc6b225SStefano Zampini   if (!a->nz) {
3571995bce04SJacob Faibussowitsch     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3572995bce04SJacob Faibussowitsch     else PetscCall(VecSeq_CUDA::Set(zz, 0));
35733ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
3574e6e9a74fSStefano Zampini   }
357534d6c7a5SJose E. Roman   /* The line below is necessary due to operations that modify the matrix on the CPU (axpy, scale, etc.) */
35769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3577e6e9a74fSStefano Zampini   if (!trans) {
35789ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
35795f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3580e6e9a74fSStefano Zampini   } else {
35811a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3582e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3583e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3584e6e9a74fSStefano Zampini     } else {
35859566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3586e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3587e6e9a74fSStefano Zampini     }
3588e6e9a74fSStefano Zampini   }
3589e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3590e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3591213423ffSJunchao Zhang 
3592e6e9a74fSStefano Zampini   try {
35939566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
359469d47153SPierre Jolivet     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
35959566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3596afb2bd1cSJunchao Zhang 
35979566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3598e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3599afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3600afb2bd1cSJunchao Zhang          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3601afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3602afb2bd1cSJunchao Zhang       */
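      /* For example (illustrative sizes): if A has 10 rows of which only 4 are nonzero, the
         compressed A*x is computed into a length-4 work vector and later scatter-added into the
         full-length z via the ScatterAdd kernel above. */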
3603e6e9a74fSStefano Zampini       xptr = xarray;
3604afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3605213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3606afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3607afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3608afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3609afb2bd1cSJunchao Zhang        */
3610afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3611afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3612fe5544b9SJunchao Zhang         nx             = mat->num_cols; // since y = Ax
3613afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3614afb2bd1cSJunchao Zhang       }
3615afb2bd1cSJunchao Zhang #endif
3616e6e9a74fSStefano Zampini     } else {
3617afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3618afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector holding the compressed (shorter) version of x to compute A^T x.
3619afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3620afb2bd1cSJunchao Zhang        */
3621afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3622e6e9a74fSStefano Zampini       dptr = zarray;
3623e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3624afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3625e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3626d0967f54SJacob Faibussowitsch 
3627d0967f54SJacob Faibussowitsch         thrust::for_each(
3628d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3629d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3630d0967f54SJacob Faibussowitsch #endif
3631d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
36329371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3633e6e9a74fSStefano Zampini       }
3634afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3635afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3636afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3637fe5544b9SJunchao Zhang         nx             = mat->num_rows; // since y = A^T x
3638afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3639afb2bd1cSJunchao Zhang       }
3640afb2bd1cSJunchao Zhang #endif
3641e6e9a74fSStefano Zampini     }
36429ae82921SPaul Mullowney 
3643afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3644aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3645afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3646fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3647fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but a cuSPARSE bug (issue #212) in 12.4 and later forces us to create a separate one for each opA.
3648fe5544b9SJunchao Zhang   #else
3649fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3650fe5544b9SJunchao Zhang   #endif
3651fe5544b9SJunchao Zhang 
36525f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3653fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3654fe5544b9SJunchao Zhang       if (!matDescr) {
3655fe5544b9SJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3656fe5544b9SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3657fe5544b9SJunchao Zhang       }
3658fe5544b9SJunchao Zhang   #endif
3659fe5544b9SJunchao Zhang 
3660afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
36619566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
36629566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
36639371c9d4SSatish Balay         PetscCallCUSPARSE(
3664fe5544b9SJunchao Zhang           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
36659566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3666fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3667fe5544b9SJunchao Zhang         PetscCallCUSPARSE(
3668fe5544b9SJunchao Zhang           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3669fe5544b9SJunchao Zhang   #endif
3670afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3671afb2bd1cSJunchao Zhang       } else {
3672afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
36739566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
36749566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3675afb2bd1cSJunchao Zhang       }
3676afb2bd1cSJunchao Zhang 
3677fe5544b9SJunchao Zhang       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3678afb2bd1cSJunchao Zhang #else
36797656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
36809371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3681afb2bd1cSJunchao Zhang #endif
3682aa372e3fSPaul Mullowney     } else {
3683213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3684afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3685afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3686afb2bd1cSJunchao Zhang #else
3687301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
36889371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3689afb2bd1cSJunchao Zhang #endif
3690a65300a6SPaul Mullowney       }
3691aa372e3fSPaul Mullowney     }
36929566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3693aa372e3fSPaul Mullowney 
3694e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3695213423ffSJunchao Zhang       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3696213423ffSJunchao Zhang         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3697995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3698e6e9a74fSStefano Zampini         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3699995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
37007656d835SStefano Zampini         }
3701213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3702995bce04SJacob Faibussowitsch         PetscCall(VecSeq_CUDA::Set(zz, 0));
37037656d835SStefano Zampini       }
37047656d835SStefano Zampini 
3705213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3706213423ffSJunchao Zhang       if (compressed) {
37079566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
37086497c311SBarry Smith         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
37096497c311SBarry Smith         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
37109566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3711e6e9a74fSStefano Zampini       }
3712e6e9a74fSStefano Zampini     } else {
3713995bce04SJacob Faibussowitsch       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3714e6e9a74fSStefano Zampini     }
37159566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
37169566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
37179566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3718d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3719d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3720d71ae5a4SJacob Faibussowitsch   }
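  /* Flop accounting: each stored nonzero contributes one multiply and one add; when there is no y,
     the first product in each nonzero row is a plain write rather than an add, hence the subtraction. */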
3721e6e9a74fSStefano Zampini   if (yy) {
37229566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3723e6e9a74fSStefano Zampini   } else {
37249566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3725e6e9a74fSStefano Zampini   }
37263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37279ae82921SPaul Mullowney }
37289ae82921SPaul Mullowney 
3729d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3730d71ae5a4SJacob Faibussowitsch {
3731ca45077fSPaul Mullowney   PetscFunctionBegin;
37329566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
37333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3734ca45077fSPaul Mullowney }
3735ca45077fSPaul Mullowney 
3736d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3737d71ae5a4SJacob Faibussowitsch {
3738042217e8SBarry Smith   PetscFunctionBegin;
37399566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
37403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37419ae82921SPaul Mullowney }
37429ae82921SPaul Mullowney 
3743e057df02SPaul Mullowney /*@
374453220ed8SBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
37459ae82921SPaul Mullowney 
3746d083f849SBarry Smith   Collective
37479ae82921SPaul Mullowney 
37489ae82921SPaul Mullowney   Input Parameters:
374911a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
37509ae82921SPaul Mullowney . m    - number of rows
37519ae82921SPaul Mullowney . n    - number of columns
375220f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
375320f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37549ae82921SPaul Mullowney 
37559ae82921SPaul Mullowney   Output Parameter:
37569ae82921SPaul Mullowney . A - the matrix
37579ae82921SPaul Mullowney 
37582ef1f0ffSBarry Smith   Level: intermediate
37592ef1f0ffSBarry Smith 
37602ef1f0ffSBarry Smith   Notes:
37622920cce0SJacob Faibussowitsch   This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for
37622920cce0SJacob Faibussowitsch   calculations. For good matrix assembly performance the user should preallocate the matrix
37632920cce0SJacob Faibussowitsch   storage by setting the parameter `nz` (or the array `nnz`).
37642920cce0SJacob Faibussowitsch 
376511a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
37669ae82921SPaul Mullowney   MatXXXXSetPreallocation() paradigm instead of this routine directly.
376711a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
37689ae82921SPaul Mullowney 
376911a5261eSBarry Smith   The AIJ format, also called
37702ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
37719ae82921SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
377220f4b53cSBarry Smith   either one (as in Fortran) or zero.
37739ae82921SPaul Mullowney 
37749ae82921SPaul Mullowney   Specify the preallocated storage with either `nz` or `nnz` (not both).
37752ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
377620f4b53cSBarry Smith   allocation.
37779ae82921SPaul Mullowney 
377853220ed8SBarry Smith   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
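
  A minimal sketch of that COO assembly path (the matrix size and values below are purely illustrative; error checking omitted)
.vb
  PetscInt    coo_i[] = {0, 0, 1, 1};
  PetscInt    coo_j[] = {0, 1, 0, 1};
  PetscScalar coo_v[] = {1.0, 2.0, 3.0, 4.0};
  Mat         A;

  MatCreate(PETSC_COMM_SELF, &A);
  MatSetSizes(A, 2, 2, 2, 2);
  MatSetType(A, MATSEQAIJCUSPARSE);
  MatSetPreallocationCOO(A, 4, coo_i, coo_j);
  MatSetValuesCOO(A, coo_v, INSERT_VALUES);
.ve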
377953220ed8SBarry Smith 
378053220ed8SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
378153220ed8SBarry Smith           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
37829ae82921SPaul Mullowney @*/
3783d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3784d71ae5a4SJacob Faibussowitsch {
37859ae82921SPaul Mullowney   PetscFunctionBegin;
37869566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
37879566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
37889566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
37899566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
37903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37919ae82921SPaul Mullowney }
37929ae82921SPaul Mullowney 
3793d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3794d71ae5a4SJacob Faibussowitsch {
37959ae82921SPaul Mullowney   PetscFunctionBegin;
37969ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
37972c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
37989ae82921SPaul Mullowney   } else {
37999566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3800aa372e3fSPaul Mullowney   }
38019566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
38029566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
38039566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
38049566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
38059566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
38069566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
38079566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
38089566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
38099566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
38109566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
38119566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
38123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38139ae82921SPaul Mullowney }
38149ae82921SPaul Mullowney 
3815ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
381695639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3817d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3818d71ae5a4SJacob Faibussowitsch {
38199ff858a8SKarl Rupp   PetscFunctionBegin;
38209566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
38219566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
38223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38239ff858a8SKarl Rupp }
38249ff858a8SKarl Rupp 
3825d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3826d71ae5a4SJacob Faibussowitsch {
3827a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3828039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3829039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3830039c6fbaSStefano Zampini   PetscScalar        *ay;
3831039c6fbaSStefano Zampini   const PetscScalar  *ax;
3832039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3833e6e9a74fSStefano Zampini 
383495639643SRichard Tran Mills   PetscFunctionBegin;
3835a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3836a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3837039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
38389566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38399566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
38403ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
384195639643SRichard Tran Mills   }
3842039c6fbaSStefano Zampini   /* if we are here, both matrices use the GPU (neither is bound to the CPU) */
38439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
38449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
38455f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
38465f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3847039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3848039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3849039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
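  /* Identical row offsets and column indices mean X and Y share the same nonzero pattern, so
     Y += a*X reduces to an axpy on the value arrays (the SAME_NONZERO_PATTERN branch below). */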
3850039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3851039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3852ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3853039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3854039c6fbaSStefano Zampini   }
3855d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3856d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3857039c6fbaSStefano Zampini 
3858039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3859039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3860039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3861039c6fbaSStefano Zampini     size_t bufferSize;
3862039c6fbaSStefano Zampini     void  *buffer;
3863039c6fbaSStefano Zampini #endif
3864039c6fbaSStefano Zampini 
38659566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
38669566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38679566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3868039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
38699371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38709371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
38719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
38729566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38739371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38749371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
38759566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
38769566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
38779566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3878039c6fbaSStefano Zampini #else
38799566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38809371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38819371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
38829566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
38839566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3884039c6fbaSStefano Zampini #endif
38859566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
38869566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
38879566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
38889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3889039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3890a587d139SMark     cublasHandle_t cublasv2handle;
3891a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3892039c6fbaSStefano Zampini 
38939566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
38949566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38959566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
38969566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
38979566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38989566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
38999566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
39009566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39019566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39029566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3904039c6fbaSStefano Zampini   } else {
39059566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
39069566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3907a587d139SMark   }
39083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
390995639643SRichard Tran Mills }
391095639643SRichard Tran Mills 
3911d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3912d71ae5a4SJacob Faibussowitsch {
391333c9ba73SStefano Zampini   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
391433c9ba73SStefano Zampini   PetscScalar   *ay;
391533c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
391633c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
391733c9ba73SStefano Zampini 
391833c9ba73SStefano Zampini   PetscFunctionBegin;
39199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39209566063dSJacob Faibussowitsch   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39219566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(y->nz, &bnz));
39229566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
39239566063dSJacob Faibussowitsch   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
39249566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(bnz));
39259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
39269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39279566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
39283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
392933c9ba73SStefano Zampini }
393033c9ba73SStefano Zampini 
3931d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3932d71ae5a4SJacob Faibussowitsch {
39337e8381f9SStefano Zampini   PetscBool   both = PETSC_FALSE;
3934a587d139SMark   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
39357e8381f9SStefano Zampini 
39363fa6b06aSMark Adams   PetscFunctionBegin;
39373fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
39383fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
39397e8381f9SStefano Zampini     if (spptr->mat) {
39407e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
39417e8381f9SStefano Zampini       if (matrix->values) {
39427e8381f9SStefano Zampini         both = PETSC_TRUE;
39437e8381f9SStefano Zampini         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39447e8381f9SStefano Zampini       }
39457e8381f9SStefano Zampini     }
39467e8381f9SStefano Zampini     if (spptr->matTranspose) {
39477e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3948ad540459SPierre Jolivet       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39497e8381f9SStefano Zampini     }
39503fa6b06aSMark Adams   }
39519566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
39529566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
39537e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3954a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
39553ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
39563fa6b06aSMark Adams }
39573fa6b06aSMark Adams 
39582c55c4ccSJose E. Roman static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
395903db1824SAlex Lindsay {
396003db1824SAlex Lindsay   PetscFunctionBegin;
396103db1824SAlex Lindsay   *m = PETSC_MEMTYPE_CUDA;
396203db1824SAlex Lindsay   PetscFunctionReturn(PETSC_SUCCESS);
396303db1824SAlex Lindsay }
396403db1824SAlex Lindsay 
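/* Swaps the Mat operation table between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE) kernels; this is
   normally reached through the public MatBindToCPU(A, flg) interface. */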
3965d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3966d71ae5a4SJacob Faibussowitsch {
3967a587d139SMark   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3968a587d139SMark 
3969a587d139SMark   PetscFunctionBegin;
39709a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
39719a14fc28SStefano Zampini     A->boundtocpu = flg;
39723ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
39739a14fc28SStefano Zampini   }
3974a587d139SMark   if (flg) {
39759566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3976a587d139SMark 
397733c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3978a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3979a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3980a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3981a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3982a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3983a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3984a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3985a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3986fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
398703db1824SAlex Lindsay     A->ops->getcurrentmemtype         = NULL;
39889566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
39899566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
39909566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
39919566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
39929566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
39939566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
39949566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3995a587d139SMark   } else {
399633c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3997a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3998a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3999a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4000a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4001a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4002a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4003a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4004a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4005fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
400603db1824SAlex Lindsay     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
400767a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
400867a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
400967a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
401067a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
401167a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
401267a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
40137ee59b9bSJunchao Zhang     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
40147ee59b9bSJunchao Zhang 
40159566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
40169566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
40179566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
40189566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
40199566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
40209566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4021a587d139SMark   }
4022a587d139SMark   A->boundtocpu = flg;
40234d12350bSJunchao Zhang   if (flg && a->inode.size_csr) {
4024ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
4025ea500dcfSRichard Tran Mills   } else {
4026ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
4027ea500dcfSRichard Tran Mills   }
40283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4029a587d139SMark }
4030a587d139SMark 
40318eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4032d71ae5a4SJacob Faibussowitsch {
403349735bf3SStefano Zampini   Mat B;
40349ae82921SPaul Mullowney 
40359ae82921SPaul Mullowney   PetscFunctionBegin;
40369566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
403749735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
40389566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
403949735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
40409566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
404149735bf3SStefano Zampini   }
404249735bf3SStefano Zampini   B = *newmat;
404349735bf3SStefano Zampini 
40449566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
40459566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
404634136279SStefano Zampini 
404749735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
40489ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
4049e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
40509566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
40519566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
40529566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
40531a2c6b5cSJunchao Zhang       spptr->format = MAT_CUSPARSE_CSR;
4054d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4055b917901dSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4056a435da06SStefano Zampini       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4057a435da06SStefano Zampini   #else
4058d8132acaSStefano Zampini       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4059a435da06SStefano Zampini   #endif
4060d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default; we only support a column-major dense matrix B */
4061d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4062d8132acaSStefano Zampini #endif
40631a2c6b5cSJunchao Zhang       B->spptr = spptr;
40649ae82921SPaul Mullowney     } else {
4065e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
4066e6e9a74fSStefano Zampini 
40679566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
40689566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
40699566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4070e6e9a74fSStefano Zampini       B->spptr = spptr;
40719ae82921SPaul Mullowney     }
4072e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
407349735bf3SStefano Zampini   }
4074693b0035SStefano Zampini   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
40759ae82921SPaul Mullowney   B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
40761a2c6b5cSJunchao Zhang   B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
40779ae82921SPaul Mullowney   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
407895639643SRichard Tran Mills   B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
4079693b0035SStefano Zampini   B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
408003db1824SAlex Lindsay   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
40812205254eSKarl Rupp 
40829566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
40839566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
40849566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4085ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
40869566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4087ae48a8d0SStefano Zampini #endif
40889566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
40893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
40909ae82921SPaul Mullowney }
40919ae82921SPaul Mullowney 
4092d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4093d71ae5a4SJacob Faibussowitsch {
409402fe1965SBarry Smith   PetscFunctionBegin;
40959566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
40969566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
40973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
409802fe1965SBarry Smith }
409902fe1965SBarry Smith 
41003ca39a21SBarry Smith /*MC
410153220ed8SBarry Smith    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4102e057df02SPaul Mullowney 
4103e057df02SPaul Mullowney    Options Database Keys:
410453220ed8SBarry Smith +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
410553220ed8SBarry Smith .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
41062ef1f0ffSBarry Smith                                            Other options include ell (ellpack) or hyb (hybrid).
410753220ed8SBarry Smith .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
410853220ed8SBarry Smith -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4109e057df02SPaul Mullowney 
4110e057df02SPaul Mullowney   Level: beginner
4111e057df02SPaul Mullowney 
411253220ed8SBarry Smith   Notes:
411353220ed8SBarry Smith   These matrices can be in either CSR, ELL, or HYB format.
411453220ed8SBarry Smith 
411553220ed8SBarry Smith   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
411653220ed8SBarry Smith 
411753220ed8SBarry Smith   Uses 32-bit integers internally. If PETSc is configured with `--with-64-bit-indices`, the integer row and column indices are stored on the GPU as `int`. It is unclear what happens
411853220ed8SBarry Smith   if some integer values passed in do not fit in `int`.
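
  For example, a run selecting this type and the hybrid multiply format at the command line might look like (the executable name is illustrative)
.vb
  ./myapp -mat_type aijcusparse -mat_cusparse_mult_storage_format hyb
.ve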
411953220ed8SBarry Smith 
41201cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4121e057df02SPaul Mullowney M*/
41227f756511SDominic Meiser 
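/* Registers the cuSPARSE-based factorizations listed below; once registered they can be selected
   at runtime with the usual PETSc options (not specific to this file), e.g.
   -pc_type ilu -pc_factor_mat_solver_type cusparse */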
4123d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4124d71ae5a4SJacob Faibussowitsch {
412542c9c57cSBarry Smith   PetscFunctionBegin;
41269566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
41279566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
41289566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
41299566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
41303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
413142c9c57cSBarry Smith }
413229b38603SBarry Smith 
41332c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4134d71ae5a4SJacob Faibussowitsch {
41352c4ab24aSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4136cbc6b225SStefano Zampini 
4137cbc6b225SStefano Zampini   PetscFunctionBegin;
41382c4ab24aSJunchao Zhang   if (cusp) {
41392c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
41402c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
41412c4ab24aSJunchao Zhang     delete cusp->workVector;
41422c4ab24aSJunchao Zhang     delete cusp->rowoffsets_gpu;
41432c4ab24aSJunchao Zhang     delete cusp->csr2csc_i;
41442c4ab24aSJunchao Zhang     delete cusp->coords;
41452c4ab24aSJunchao Zhang     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
41462c4ab24aSJunchao Zhang     PetscCall(PetscFree(mat->spptr));
41477f756511SDominic Meiser   }
41483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41497f756511SDominic Meiser }
41507f756511SDominic Meiser 
4151d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4152d71ae5a4SJacob Faibussowitsch {
41537f756511SDominic Meiser   PetscFunctionBegin;
41547f756511SDominic Meiser   if (*mat) {
41557f756511SDominic Meiser     delete (*mat)->values;
41567f756511SDominic Meiser     delete (*mat)->column_indices;
41577f756511SDominic Meiser     delete (*mat)->row_offsets;
41587f756511SDominic Meiser     delete *mat;
41597f756511SDominic Meiser     *mat = 0;
41607f756511SDominic Meiser   }
41613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41627f756511SDominic Meiser }
41637f756511SDominic Meiser 
4164b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4165d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4166d71ae5a4SJacob Faibussowitsch {
41677f756511SDominic Meiser   PetscFunctionBegin;
41687f756511SDominic Meiser   if (*trifactor) {
41699566063dSJacob Faibussowitsch     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4170261a78b4SJunchao Zhang     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
41719566063dSJacob Faibussowitsch     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
41729566063dSJacob Faibussowitsch     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
41739566063dSJacob Faibussowitsch     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4174afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
41759566063dSJacob Faibussowitsch     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4176afb2bd1cSJunchao Zhang   #endif
41779566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactor));
41787f756511SDominic Meiser   }
41793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41807f756511SDominic Meiser }
4181d460d7bfSJunchao Zhang #endif
41827f756511SDominic Meiser 
4183d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4184d71ae5a4SJacob Faibussowitsch {
41857f756511SDominic Meiser   CsrMatrix *mat;
41867f756511SDominic Meiser 
41877f756511SDominic Meiser   PetscFunctionBegin;
41887f756511SDominic Meiser   if (*matstruct) {
41897f756511SDominic Meiser     if ((*matstruct)->mat) {
41907f756511SDominic Meiser       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4191afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4192afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4193afb2bd1cSJunchao Zhang #else
41947f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
41959566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4196afb2bd1cSJunchao Zhang #endif
41977f756511SDominic Meiser       } else {
41987f756511SDominic Meiser         mat = (CsrMatrix *)(*matstruct)->mat;
41993ba16761SJacob Faibussowitsch         PetscCall(CsrMatrix_Destroy(&mat));
42007f756511SDominic Meiser       }
42017f756511SDominic Meiser     }
42029566063dSJacob Faibussowitsch     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
42037f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
42049566063dSJacob Faibussowitsch     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
42059566063dSJacob Faibussowitsch     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
42069566063dSJacob Faibussowitsch     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4207afb2bd1cSJunchao Zhang 
4208afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4209afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
42109566063dSJacob Faibussowitsch     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4211fe5544b9SJunchao Zhang 
4212afb2bd1cSJunchao Zhang     for (int i = 0; i < 3; i++) {
4213afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
42149566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
42159566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
42169566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4217fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4218fe5544b9SJunchao Zhang         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4219fe5544b9SJunchao Zhang         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4220fe5544b9SJunchao Zhang   #endif
4221afb2bd1cSJunchao Zhang       }
4222afb2bd1cSJunchao Zhang     }
4223afb2bd1cSJunchao Zhang #endif
42247f756511SDominic Meiser     delete *matstruct;
42257e8381f9SStefano Zampini     *matstruct = NULL;
42267f756511SDominic Meiser   }
42273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42287f756511SDominic Meiser }
42297f756511SDominic Meiser 
4230d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4231d71ae5a4SJacob Faibussowitsch {
4232da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4233da112707SJunchao Zhang 
42347f756511SDominic Meiser   PetscFunctionBegin;
4235da112707SJunchao Zhang   if (fs) {
4236b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4237da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4238da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4239da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4240da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4241d460d7bfSJunchao Zhang     delete fs->workVector;
4242d460d7bfSJunchao Zhang     fs->workVector = NULL;
4243d460d7bfSJunchao Zhang #endif
4244da112707SJunchao Zhang     delete fs->rpermIndices;
4245da112707SJunchao Zhang     delete fs->cpermIndices;
4246da112707SJunchao Zhang     fs->rpermIndices  = NULL;
4247da112707SJunchao Zhang     fs->cpermIndices  = NULL;
4248da112707SJunchao Zhang     fs->init_dev_prop = PETSC_FALSE;
4249b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4250da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4251da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx));
425230807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
425330807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4254da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrVal));
4255d460d7bfSJunchao Zhang     PetscCallCUDA(cudaFree(fs->diag));
4256da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->X));
4257da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->Y));
425812ba2bc6SJunchao Zhang     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed, since factBuffer_M shares its memory with one of spsvBuffer_L/U */
4259da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4260da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
426112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4262da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4263da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4264da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4265da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4266da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4267da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4268da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4269da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4270da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4271da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4272da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4273da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4274d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrRowPtr_h));
4275d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrVal_h));
4276d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->diag_h));
427712ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
427812ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4279da112707SJunchao Zhang #endif
4280ccdfe979SStefano Zampini   }
42813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4282ccdfe979SStefano Zampini }
4283ccdfe979SStefano Zampini 
4284d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4285d71ae5a4SJacob Faibussowitsch {
4286ccdfe979SStefano Zampini   PetscFunctionBegin;
4287ccdfe979SStefano Zampini   if (*trifactors) {
42889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4289f0173cd6SStefano Zampini     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
42909566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactors));
42917f756511SDominic Meiser   }
42923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42937f756511SDominic Meiser }
42947e8381f9SStefano Zampini 
42959371c9d4SSatish Balay struct IJCompare {
4296d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4297d71ae5a4SJacob Faibussowitsch   {
42980b156cc8SJunchao Zhang     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
42990b156cc8SJunchao Zhang     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
43007e8381f9SStefano Zampini     return false;
43017e8381f9SStefano Zampini   }
43027e8381f9SStefano Zampini };
43037e8381f9SStefano Zampini 
430466976f2fSJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4305d71ae5a4SJacob Faibussowitsch {
4306a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4307a49f1ed0SStefano Zampini 
4308a49f1ed0SStefano Zampini   PetscFunctionBegin;
4309a49f1ed0SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
43103ba16761SJacob Faibussowitsch   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4311a49f1ed0SStefano Zampini   if (destroy) {
43129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4313a49f1ed0SStefano Zampini     delete cusp->csr2csc_i;
4314a49f1ed0SStefano Zampini     cusp->csr2csc_i = NULL;
4315a49f1ed0SStefano Zampini   }
43161a2c6b5cSJunchao Zhang   A->transupdated = PETSC_FALSE;
43173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4318a49f1ed0SStefano Zampini }
4319a49f1ed0SStefano Zampini 
432049abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4321d71ae5a4SJacob Faibussowitsch {
432249abdd8aSBarry Smith   MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;
43234d86920dSPierre Jolivet 
43247e8381f9SStefano Zampini   PetscFunctionBegin;
43252c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->perm));
43262c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->jmap));
43272c4ab24aSJunchao Zhang   PetscCall(PetscFree(coo));
43283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43297e8381f9SStefano Zampini }
4330ed502f03SStefano Zampini 
433166976f2fSJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4332d71ae5a4SJacob Faibussowitsch {
43332c4ab24aSJunchao Zhang   PetscBool            dev_ij = PETSC_FALSE;
43342c4ab24aSJunchao Zhang   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
43352c4ab24aSJunchao Zhang   PetscInt            *i, *j;
433603e76207SPierre Jolivet   PetscContainer       container_h;
43372c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4338219fbbafSJunchao Zhang 
4339219fbbafSJunchao Zhang   PetscFunctionBegin;
43409566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(coo_i, &mtype));
43412c4ab24aSJunchao Zhang   if (PetscMemTypeDevice(mtype)) {
43422c4ab24aSJunchao Zhang     dev_ij = PETSC_TRUE;
43432c4ab24aSJunchao Zhang     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
43442c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
43452c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
43462c4ab24aSJunchao Zhang   } else {
43472c4ab24aSJunchao Zhang     i = coo_i;
43482c4ab24aSJunchao Zhang     j = coo_j;
4349219fbbafSJunchao Zhang   }
4350219fbbafSJunchao Zhang 
43512c4ab24aSJunchao Zhang   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
43522c4ab24aSJunchao Zhang   if (dev_ij) PetscCall(PetscFree2(i, j));
4353cbc6b225SStefano Zampini   mat->offloadmask = PETSC_OFFLOAD_CPU;
43542c4ab24aSJunchao Zhang   // Create the GPU memory
43559566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
43562c4ab24aSJunchao Zhang 
43572c4ab24aSJunchao Zhang   // Copy the COO struct to device
43582c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
43592c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
43602c4ab24aSJunchao Zhang   PetscCall(PetscMalloc1(1, &coo_d));
43612c4ab24aSJunchao Zhang   *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
43622c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
43632c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
43642c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
43652c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
43662c4ab24aSJunchao Zhang 
43672c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
436803e76207SPierre Jolivet   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
43693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4370219fbbafSJunchao Zhang }
4371219fbbafSJunchao Zhang 
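/* Device kernel used by MatSetValuesCOO_SeqAIJCUSPARSE() below: a grid-stride loop in which, for each
   stored nonzero i, jmap[i]..jmap[i+1] delimits the entries of perm[] that select the (possibly repeated)
   user-provided COO values in kv[] contributing to that nonzero */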
4372d71ae5a4SJacob Faibussowitsch __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4373d71ae5a4SJacob Faibussowitsch {
4374219fbbafSJunchao Zhang   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4375219fbbafSJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
4376b6c38306SJunchao Zhang   for (; i < nnz; i += grid_size) {
4377b6c38306SJunchao Zhang     PetscScalar sum = 0.0;
4378b6c38306SJunchao Zhang     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4379b6c38306SJunchao Zhang     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4380b6c38306SJunchao Zhang   }
4381219fbbafSJunchao Zhang }
4382219fbbafSJunchao Zhang 
438366976f2fSJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4384d71ae5a4SJacob Faibussowitsch {
4385219fbbafSJunchao Zhang   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
4386219fbbafSJunchao Zhang   Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4387219fbbafSJunchao Zhang   PetscCount           Annz = seq->nz;
4388219fbbafSJunchao Zhang   PetscMemType         memtype;
4389219fbbafSJunchao Zhang   const PetscScalar   *v1 = v;
4390219fbbafSJunchao Zhang   PetscScalar         *Aa;
43912c4ab24aSJunchao Zhang   PetscContainer       container;
43922c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ *coo;
4393219fbbafSJunchao Zhang 
4394219fbbafSJunchao Zhang   PetscFunctionBegin;
43952c4ab24aSJunchao Zhang   if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
43962c4ab24aSJunchao Zhang 
43972c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
43982c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
43992c4ab24aSJunchao Zhang 
44009566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
4401219fbbafSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
44022c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
44032c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4404219fbbafSJunchao Zhang   }
4405219fbbafSJunchao Zhang 
44069566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
44079566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4408219fbbafSJunchao Zhang 
440908bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
4410cbc6b225SStefano Zampini   if (Annz) {
44116497c311SBarry Smith     MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
44129566063dSJacob Faibussowitsch     PetscCallCUDA(cudaPeekAtLastError());
4413cbc6b225SStefano Zampini   }
441408bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
4415219fbbafSJunchao Zhang 
44169566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
44179566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4418219fbbafSJunchao Zhang 
44199566063dSJacob Faibussowitsch   if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
44203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4421219fbbafSJunchao Zhang }
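/*
  A minimal usage sketch (not part of this file) of the COO assembly path implemented above; the
  sizes and index values are illustrative only. Repeated (i,j) pairs are summed into the same stored
  nonzero, and v[] may be a host or device pointer.

    PetscInt    coo_i[] = {0, 0, 1, 1};           // row indices, repeats allowed
    PetscInt    coo_j[] = {0, 1, 0, 0};           // column indices
    PetscScalar v[]     = {1.0, 2.0, 3.0, 4.0};

    PetscCall(MatSetPreallocationCOO(A, 4, coo_i, coo_j)); // A has type MATSEQAIJCUSPARSE
    PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));       // entry (1,0) receives 3.0 + 4.0
*/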
4422219fbbafSJunchao Zhang 
44235b7e41feSStefano Zampini /*@C
44242ef1f0ffSBarry Smith   MatSeqAIJCUSPARSEGetIJ - returns the device CSR row pointers `i` and column indices `j` for `MATSEQAIJCUSPARSE` matrices.
44255b7e41feSStefano Zampini 
44262ef1f0ffSBarry Smith   Not Collective
44275b7e41feSStefano Zampini 
44285b7e41feSStefano Zampini   Input Parameters:
44295b7e41feSStefano Zampini + A          - the matrix
443011a5261eSBarry Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE`, indicating whether the matrix data structure should always be returned in compressed form
44315b7e41feSStefano Zampini 
44325b7e41feSStefano Zampini   Output Parameters:
443353220ed8SBarry Smith + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
443453220ed8SBarry Smith - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
44355b7e41feSStefano Zampini 
44365b7e41feSStefano Zampini   Level: developer
44375b7e41feSStefano Zampini 
443811a5261eSBarry Smith   Note:
44395b7e41feSStefano Zampini   When `compressed` is `PETSC_TRUE`, the returned CSR structure does not contain empty rows
44405b7e41feSStefano Zampini 
44411cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
44425b7e41feSStefano Zampini @*/
4443d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4444d71ae5a4SJacob Faibussowitsch {
44455f101d05SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
44465f101d05SStefano Zampini   CsrMatrix          *csr;
44475f101d05SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;
44485f101d05SStefano Zampini 
44495f101d05SStefano Zampini   PetscFunctionBegin;
44505f101d05SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
44513ba16761SJacob Faibussowitsch   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
44525f101d05SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4453aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
44549566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
445528b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
44565f101d05SStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
44575f101d05SStefano Zampini   if (i) {
44585f101d05SStefano Zampini     if (!compressed && a->compressedrow.use) { /* need full row offset */
44595f101d05SStefano Zampini       if (!cusp->rowoffsets_gpu) {
44605f101d05SStefano Zampini         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
44615f101d05SStefano Zampini         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
44629566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
44635f101d05SStefano Zampini       }
44645f101d05SStefano Zampini       *i = cusp->rowoffsets_gpu->data().get();
44655f101d05SStefano Zampini     } else *i = csr->row_offsets->data().get();
44665f101d05SStefano Zampini   }
44675f101d05SStefano Zampini   if (j) *j = csr->column_indices->data().get();
44683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
44695f101d05SStefano Zampini }
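/*
  A minimal usage sketch (not part of this file); InspectCSR is a hypothetical device kernel that
  only reads the CSR structure, and m is the number of rows.

    const int *i, *j;

    PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &i, &j));
    InspectCSR<<<(m + 255) / 256, 256>>>(m, i, j);           // i, j are device pointers
    PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &i, &j));
*/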
44705f101d05SStefano Zampini 
44715b7e41feSStefano Zampini /*@C
44722ef1f0ffSBarry Smith   MatSeqAIJCUSPARSERestoreIJ - restore the device CSR row pointers `i` and column indices `j` obtained with `MatSeqAIJCUSPARSEGetIJ()`
44735b7e41feSStefano Zampini 
44742ef1f0ffSBarry Smith   Not Collective
44755b7e41feSStefano Zampini 
44765b7e41feSStefano Zampini   Input Parameters:
44775b7e41feSStefano Zampini + A          - the matrix
44782ef1f0ffSBarry Smith . compressed - `PETSC_TRUE` or `PETSC_FALSE`, indicating whether the matrix data structure should always be returned in compressed form
447920f4b53cSBarry Smith . i          - the CSR row pointers
448020f4b53cSBarry Smith - j          - the CSR column indices
44815b7e41feSStefano Zampini 
44825b7e41feSStefano Zampini   Level: developer
44835b7e41feSStefano Zampini 
44841cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
44855b7e41feSStefano Zampini @*/
448620f4b53cSBarry Smith PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4487d71ae5a4SJacob Faibussowitsch {
44885f101d05SStefano Zampini   PetscFunctionBegin;
44895f101d05SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
44905f101d05SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
44915f101d05SStefano Zampini   if (i) *i = NULL;
44925f101d05SStefano Zampini   if (j) *j = NULL;
449320f4b53cSBarry Smith   (void)compressed;
44943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
44955f101d05SStefano Zampini }
44965f101d05SStefano Zampini 
44975b7e41feSStefano Zampini /*@C
449853220ed8SBarry Smith   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for the nonzero entries of a `MATSEQAIJCUSPARSE` matrix is stored
44995b7e41feSStefano Zampini 
45005b7e41feSStefano Zampini   Not Collective
45015b7e41feSStefano Zampini 
45025b7e41feSStefano Zampini   Input Parameter:
450311a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45045b7e41feSStefano Zampini 
45055b7e41feSStefano Zampini   Output Parameter:
45065b7e41feSStefano Zampini . a - pointer to the device data
45075b7e41feSStefano Zampini 
45085b7e41feSStefano Zampini   Level: developer
45095b7e41feSStefano Zampini 
451011a5261eSBarry Smith   Note:
451153220ed8SBarry Smith   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
45125b7e41feSStefano Zampini 
45131cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
45145b7e41feSStefano Zampini @*/
4515d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4516d71ae5a4SJacob Faibussowitsch {
4517ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4518ed502f03SStefano Zampini   CsrMatrix          *csr;
4519ed502f03SStefano Zampini 
4520ed502f03SStefano Zampini   PetscFunctionBegin;
4521ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
45224f572ea9SToby Isaac   PetscAssertPointer(a, 2);
4523ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4524aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
45259566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
452628b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4527ed502f03SStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
452828b400f6SJacob Faibussowitsch   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4529ed502f03SStefano Zampini   *a = csr->values->data().get();
45303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4531ed502f03SStefano Zampini }
4532ed502f03SStefano Zampini 
45335b7e41feSStefano Zampini /*@C
453411a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
45355b7e41feSStefano Zampini 
45365b7e41feSStefano Zampini   Not Collective
45375b7e41feSStefano Zampini 
45382ef1f0ffSBarry Smith   Input Parameters:
45392ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
45402ef1f0ffSBarry Smith - a - pointer to the device data
45415b7e41feSStefano Zampini 
45425b7e41feSStefano Zampini   Level: developer
45435b7e41feSStefano Zampini 
45441cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
45455b7e41feSStefano Zampini @*/
4546d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4547d71ae5a4SJacob Faibussowitsch {
4548ed502f03SStefano Zampini   PetscFunctionBegin;
4549ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
45504f572ea9SToby Isaac   PetscAssertPointer(a, 2);
4551ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4552ed502f03SStefano Zampini   *a = NULL;
45533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4554ed502f03SStefano Zampini }
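/*
  A minimal usage sketch (not part of this file) pairing the two routines above; SumValues is a
  hypothetical device kernel that only reads the nonzero values, and nnz is the number of stored
  nonzeros.

    const PetscScalar *a;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &a));
    SumValues<<<(nnz + 255) / 256, 256>>>(a, nnz, result_d); // a is a device pointer
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &a));
*/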
4555ed502f03SStefano Zampini 
45565b7e41feSStefano Zampini /*@C
455711a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45585b7e41feSStefano Zampini 
45595b7e41feSStefano Zampini   Not Collective
45605b7e41feSStefano Zampini 
45615b7e41feSStefano Zampini   Input Parameter:
456211a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45635b7e41feSStefano Zampini 
45645b7e41feSStefano Zampini   Output Parameter:
45655b7e41feSStefano Zampini . a - pointer to the device data
45665b7e41feSStefano Zampini 
45675b7e41feSStefano Zampini   Level: developer
45685b7e41feSStefano Zampini 
456911a5261eSBarry Smith   Note:
457053220ed8SBarry Smith   Will trigger host-to-device copies if the most up-to-date matrix data is on the host
45715b7e41feSStefano Zampini 
45721cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
45735b7e41feSStefano Zampini @*/
4574d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4575d71ae5a4SJacob Faibussowitsch {
4576039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4577039c6fbaSStefano Zampini   CsrMatrix          *csr;
4578039c6fbaSStefano Zampini 
4579039c6fbaSStefano Zampini   PetscFunctionBegin;
4580039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
45814f572ea9SToby Isaac   PetscAssertPointer(a, 2);
4582039c6fbaSStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4583aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
45849566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
458528b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4586039c6fbaSStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
458728b400f6SJacob Faibussowitsch   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4588039c6fbaSStefano Zampini   *a             = csr->values->data().get();
4589039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
45909566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
45913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4592039c6fbaSStefano Zampini }
45935b7e41feSStefano Zampini /*@C
459411a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4595039c6fbaSStefano Zampini 
45965b7e41feSStefano Zampini   Not Collective
45975b7e41feSStefano Zampini 
45982ef1f0ffSBarry Smith   Input Parameters:
45992ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46002ef1f0ffSBarry Smith - a - pointer to the device data
46015b7e41feSStefano Zampini 
46025b7e41feSStefano Zampini   Level: developer
46035b7e41feSStefano Zampini 
46041cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
46055b7e41feSStefano Zampini @*/
4606d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4607d71ae5a4SJacob Faibussowitsch {
4608039c6fbaSStefano Zampini   PetscFunctionBegin;
4609039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
46104f572ea9SToby Isaac   PetscAssertPointer(a, 2);
4611039c6fbaSStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
46129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
46139566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4614039c6fbaSStefano Zampini   *a = NULL;
46153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4616039c6fbaSStefano Zampini }
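/*
  A minimal usage sketch (not part of this file) of modifying the nonzero values in place on the
  device; ScaleValues is a hypothetical kernel and nnz the number of stored nonzeros.

    PetscScalar *a;

    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &a));
    ScaleValues<<<(nnz + 255) / 256, 256>>>(a, nnz, alpha);  // e.g. a[k] *= alpha on the GPU
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &a));         // increases the object state
*/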
4617039c6fbaSStefano Zampini 
46185b7e41feSStefano Zampini /*@C
461911a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
46205b7e41feSStefano Zampini 
46215b7e41feSStefano Zampini   Not Collective
46225b7e41feSStefano Zampini 
46235b7e41feSStefano Zampini   Input Parameter:
462411a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
46255b7e41feSStefano Zampini 
46265b7e41feSStefano Zampini   Output Parameter:
46275b7e41feSStefano Zampini . a - pointer to the device data
46285b7e41feSStefano Zampini 
46295b7e41feSStefano Zampini   Level: developer
46305b7e41feSStefano Zampini 
463111a5261eSBarry Smith   Note:
463253220ed8SBarry Smith   Does not trigger any host-to-device copies.
463353220ed8SBarry Smith 
463453220ed8SBarry Smith   It marks the data on the GPU as valid, so the caller must set all the values in `a` to ensure out-of-date data is not treated as current
46355b7e41feSStefano Zampini 
46361cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
46375b7e41feSStefano Zampini @*/
4638d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4639d71ae5a4SJacob Faibussowitsch {
4640ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4641ed502f03SStefano Zampini   CsrMatrix          *csr;
4642ed502f03SStefano Zampini 
4643ed502f03SStefano Zampini   PetscFunctionBegin;
4644ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
46454f572ea9SToby Isaac   PetscAssertPointer(a, 2);
4646ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4647aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
464828b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4649ed502f03SStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
465028b400f6SJacob Faibussowitsch   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4651ed502f03SStefano Zampini   *a             = csr->values->data().get();
4652039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
46539566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
46543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4655ed502f03SStefano Zampini }
4656ed502f03SStefano Zampini 
46575b7e41feSStefano Zampini /*@C
465811a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
46595b7e41feSStefano Zampini 
46605b7e41feSStefano Zampini   Not Collective
46615b7e41feSStefano Zampini 
46622ef1f0ffSBarry Smith   Input Parameters:
46632ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46642ef1f0ffSBarry Smith - a - pointer to the device data
46655b7e41feSStefano Zampini 
46665b7e41feSStefano Zampini   Level: developer
46675b7e41feSStefano Zampini 
46681cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
46695b7e41feSStefano Zampini @*/
4670d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4671d71ae5a4SJacob Faibussowitsch {
4672ed502f03SStefano Zampini   PetscFunctionBegin;
4673ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
46744f572ea9SToby Isaac   PetscAssertPointer(a, 2);
4675ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
46769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
46779566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4678ed502f03SStefano Zampini   *a = NULL;
46793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4680ed502f03SStefano Zampini }
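/*
  A minimal usage sketch (not part of this file); FillValues is a hypothetical kernel that must
  write every entry of a[], since on return the data is marked valid on the GPU without any
  host-to-device copy having taken place.

    PetscScalar *a;

    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &a));
    FillValues<<<(nnz + 255) / 256, 256>>>(a, nnz);
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &a));
*/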
4681ed502f03SStefano Zampini 
46829371c9d4SSatish Balay struct IJCompare4 {
4683d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4684d71ae5a4SJacob Faibussowitsch   {
46850b156cc8SJunchao Zhang     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
46860b156cc8SJunchao Zhang     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4687ed502f03SStefano Zampini     return false;
4688ed502f03SStefano Zampini   }
4689ed502f03SStefano Zampini };
4690ed502f03SStefano Zampini 
46919371c9d4SSatish Balay struct Shift {
4692ed502f03SStefano Zampini   int _shift;
4693ed502f03SStefano Zampini 
4694ed502f03SStefano Zampini   Shift(int shift) : _shift(shift) { }
46959371c9d4SSatish Balay   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4696ed502f03SStefano Zampini };
4697ed502f03SStefano Zampini 
469821afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A and B side by side, so row i of the result is row i of A followed by row i of B; the [A';B']' operation in MATLAB notation */
4699d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4700d71ae5a4SJacob Faibussowitsch {
4701ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4702ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4703ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4704ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4705ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4706ed502f03SStefano Zampini   cusparseStatus_t              stat;
4707ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4708ed502f03SStefano Zampini 
4709ed502f03SStefano Zampini   PetscFunctionBegin;
4710ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4711ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
47124f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4713ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4714ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47155f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
471608401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4717aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4718aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4719ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4720ed502f03SStefano Zampini     m = A->rmap->n;
4721ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47229566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47239566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47249566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4725ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4726ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4727ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4728ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4729ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4730ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4731ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4732ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4733ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4734ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4735ed502f03SStefano Zampini     Ccusp->nrows            = m;
4736ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4737ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4738ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4739ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47409566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47419566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47429566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4743f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4744f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4745f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
47469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47499566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47509566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
475128b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
475228b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4753ed502f03SStefano Zampini 
4754ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4755ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4756ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4757ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4758ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4759ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4760ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4761ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4762ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
47632c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4764ed502f03SStefano Zampini     if (c->nz) {
47652ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
47662ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
47672ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
47682ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
47692ed87e7eSStefano Zampini 
4770ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4771ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4772ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4773ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
47749566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4775ed502f03SStefano Zampini         }
47762ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
47772ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4778ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4779ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4780ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4781ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
47829566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4783ed502f03SStefano Zampini         }
47842ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
47852ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
47869566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
47879371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47889371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47899371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47909371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47912ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
47922ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
47932ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
47948909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4795ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4796ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
47978909a122SStefano Zampini #else
47988909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
47998909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
48008909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
48018909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
48028909a122SStefano Zampini #endif
48032ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
48042ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
48052ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
48062ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48072ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48082ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
48092c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
48102c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
4811*29d3d2f8SNuno Nobre #if CCCL_VERSION >= 3001000
4812*29d3d2f8SNuno Nobre       cuda::std::advance(p2, Annz);
4813*29d3d2f8SNuno Nobre #else
4814ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4815*29d3d2f8SNuno Nobre #endif
4816792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48178909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48188909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48198909a122SStefano Zampini #endif
48202ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48212ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48222ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4823792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48242ed87e7eSStefano Zampini #else
482559c3d2bbSPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
48262ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
482759c3d2bbSPierre Jolivet   #else
482859c3d2bbSPierre Jolivet       auto pred = cuda::std::identity();
482959c3d2bbSPierre Jolivet   #endif
4830792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4831792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48322ed87e7eSStefano Zampini #endif
48339371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48349371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48359566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48362ed87e7eSStefano Zampini       delete wPerm;
48372ed87e7eSStefano Zampini       delete Acoo;
48382ed87e7eSStefano Zampini       delete Bcoo;
48392ed87e7eSStefano Zampini       delete Ccoo;
4840ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48419371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48429371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4843ed502f03SStefano Zampini #endif
48441a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have explicit transposes, generate the explicit transpose of C too */
48459566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48469566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4847ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4848ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4849ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4850ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4851ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4852ed502f03SStefano Zampini 
48531a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48541a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4855a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4856ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4857ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4858ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4859ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4860ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4861ed502f03SStefano Zampini 
4862ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4863ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4864ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4865ed502f03SStefano Zampini 
48669566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4867ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4868ed502f03SStefano Zampini         if (AT) {
4869ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4870*29d3d2f8SNuno Nobre #if CCCL_VERSION >= 3001000
4871*29d3d2f8SNuno Nobre           cuda::std::advance(rT, -1);
4872*29d3d2f8SNuno Nobre #else
4873ed502f03SStefano Zampini           thrust::advance(rT, -1);
4874*29d3d2f8SNuno Nobre #endif
4875ed502f03SStefano Zampini         }
4876ed502f03SStefano Zampini         if (BT) {
4877ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4878ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4879ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4880ed502f03SStefano Zampini         }
4881ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4882ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4883ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4884ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4885ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4886ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48879566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4888ed502f03SStefano Zampini 
48899566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
48909566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
48919566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4892f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4893f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4894f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
48959566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48969566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4898ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48999371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
49009371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4901ed502f03SStefano Zampini #endif
4902ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4903ed502f03SStefano Zampini       }
4904ed502f03SStefano Zampini     }
4905ed502f03SStefano Zampini 
4906ed502f03SStefano Zampini     c->free_a = PETSC_TRUE;
49079f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
49089f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4909ed502f03SStefano Zampini     c->free_ij = PETSC_TRUE;
49107de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4911ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4912ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4913ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4914ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
49159566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49169566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4917ed502f03SStefano Zampini     } else {
49189566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49199566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4920ed502f03SStefano Zampini     }
49219566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49229566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49239566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4924ed502f03SStefano Zampini     c->maxnz         = c->nz;
4925ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4926ed502f03SStefano Zampini     c->rmax          = 0;
4927ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4928ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4929ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4930ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4931ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4932ed502f03SStefano Zampini     }
49339566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
49349566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4935ed502f03SStefano Zampini     (*C)->nonzerostate++;
49369566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49379566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4938ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4939ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4940ed502f03SStefano Zampini   } else {
494108401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4942ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4943ed502f03SStefano Zampini     if (c->nz) {
4944ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49452c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4946aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
494708401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49489566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49499566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49505f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49515f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4952ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4953ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4954ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4955aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4956aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4957aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4958aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49592c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "coords size %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
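      /* Ccusp->coords maps each entry of the stacked sequence [A's nonzeros, B's nonzeros] to its
         slot in C's value array; advancing by A's entry count yields the split point between the
         A half and the B half of that map */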
49602c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
4961*29d3d2f8SNuno Nobre #if CCCL_VERSION >= 3001000
4962*29d3d2f8SNuno Nobre       cuda::std::advance(pmid, Acsr->num_entries);
4963*29d3d2f8SNuno Nobre #else
4964ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
4965*29d3d2f8SNuno Nobre #endif
49669566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
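      /* Scatter A's values and then B's values into C entirely on the GPU: each zip iterator pairs
         a source value with its destination in C (reached via a permutation iterator over coords),
         and VecCUDAEquals copies the first tuple component onto the second */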
49672c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
49689371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4969ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49709371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
49712c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4972ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
49739566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
49741a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
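        /* C is the column-wise concatenation [A B], so C^T stacks A^T on top of B^T; C^T's CSR
           value array is therefore A^T's values followed by B^T's, and two contiguous copies
           refresh it without any index arithmetic */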
49755f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4976ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4977ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4978ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4979ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4980ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4981ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4982ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49831a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4984ed502f03SStefano Zampini       }
49859566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4986ed502f03SStefano Zampini     }
4987ed502f03SStefano Zampini   }
49889566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4989ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4990ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4991ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
49923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4993ed502f03SStefano Zampini }
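/* A minimal usage sketch of the merge routine whose two branches appear above. This is an
   illustration, not a verbatim excerpt: it assumes the internal signature
   MatSeqAIJCUSPARSEMergeMats(Mat, Mat, MatReuse, Mat *) and two existing SeqAIJCUSPARSE
   matrices A and B with the same number of rows.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // build C = [A B], record coords
     // ... update the numerical values of A and B, keeping the sparsity pattern ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   // fast path: rescatter values only
*/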
4994c215019aSStefano Zampini 
4995d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4996d71ae5a4SJacob Faibussowitsch {
4997c215019aSStefano Zampini   bool               dmem;
4998c215019aSStefano Zampini   const PetscScalar *av;
4999c215019aSStefano Zampini 
5000c215019aSStefano Zampini   PetscFunctionBegin;
5001c215019aSStefano Zampini   dmem = isCudaMem(v);
50029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5003c215019aSStefano Zampini   if (n && idx) {
5004c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
5005c215019aSStefano Zampini     widx.assign(idx, idx + n);
50069566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5007c215019aSStefano Zampini 
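    /* Gather v[i] = av[idx[i]] for i = 0..n-1. The gather itself always runs on the GPU; when v is
       host memory the result is staged in a temporary device buffer w and copied back in one shot */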
5008c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
5009c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
5010c215019aSStefano Zampini     if (dmem) {
5011c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
5012c215019aSStefano Zampini     } else {
5013c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
5014c215019aSStefano Zampini       dv = w->data();
5015c215019aSStefano Zampini     }
5016c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5017c215019aSStefano Zampini 
5018c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5019c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5020c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
502148a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5022c215019aSStefano Zampini     delete w;
5023c215019aSStefano Zampini   } else {
50249566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5025c215019aSStefano Zampini   }
50269566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
50279566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
50283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5029c215019aSStefano Zampini }
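/* For reference, the permutation-iterator gather used above, reduced to a self-contained sketch.
   The names below (gather, src, map, dst) are hypothetical and not part of PETSc; dst must hold at
   least map.size() elements.

     #include <thrust/copy.h>
     #include <thrust/device_vector.h>
     #include <thrust/iterator/permutation_iterator.h>

     static void gather(const thrust::device_vector<double> &src, const thrust::device_vector<int> &map, thrust::device_vector<double> &dst)
     {
       // dst[i] = src[map[i]] in a single fused kernel, with no intermediate index array
       thrust::copy(thrust::make_permutation_iterator(src.begin(), map.begin()),
                    thrust::make_permutation_iterator(src.begin(), map.end()),
                    dst.begin());
     }
*/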
5030