xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision cc1eb50d5a4d6061e906552df09a79d2d9d16af2)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14
18d0967f54SJacob Faibussowitsch #endif
19a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
20a2cee5feSJed Brown #include <thrust/remove.h>
21a2cee5feSJed Brown #include <thrust/sort.h>
22a2cee5feSJed Brown #include <thrust/unique.h>
2359c3d2bbSPierre Jolivet #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
2459c3d2bbSPierre Jolivet   #include <cuda/std/functional>
2559c3d2bbSPierre Jolivet #endif
26e8d2b73aSMark Adams 
27e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2946aba097SBarry Smith /*
3046aba097SBarry Smith   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31afb2bd1cSJunchao Zhang   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32afb2bd1cSJunchao Zhang */
33afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
36afb2bd1cSJunchao Zhang #endif
379ae82921SPaul Mullowney 
38087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
416fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
436fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
446fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
45d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
466fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
47d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
48d460d7bfSJunchao Zhang #endif
49ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
50a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
5133c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
526fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
536fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
546fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
556fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
56e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
57e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
58e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
599ae82921SPaul Mullowney 
607f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
61470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
62470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
632c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
647f756511SDominic Meiser 
6557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
66a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
6757181aedSStefano Zampini 
68c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
69e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
70219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71c215019aSStefano Zampini 
// Type-specific implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.
// Records the requested GPU storage format in the matrix's Mat_SeqAIJCUSPARSE context.
// A sequential AIJ matrix has a single storage format field, so MAT_CUSPARSE_MULT and
// MAT_CUSPARSE_ALL are equivalent here (the diag/offdiag distinction only exists for MPIAIJ).
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: // fall through: both operations set the same (single) format field on seq matrices
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
899ae82921SPaul Mullowney 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if registered; silently a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
113e057df02SPaul Mullowney 
// Type-specific implementation of MatCUSPARSESetUseCPUSolve(): store the CPU-solve
// preference in the matrix's Mat_SeqAIJCUSPARSE context.
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  cusp                = (Mat_SeqAIJCUSPARSE *)A->spptr;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
122365b711fSMark Adams 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  This method specifies whether the solve is done on the CPU or the GPU (the GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if registered; silently a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
146365b711fSMark Adams 
// Handle the one CUSPARSE-specific matrix option; delegate everything else to the SeqAIJ implementation.
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
162e6e9a74fSStefano Zampini 
// Process -mat_cusparse_* command-line options for a MATSEQAIJCUSPARSE matrix.
// Options are only consulted for non-factored matrices (A->factortype == MAT_FACTOR_NONE).
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* storage format for all operations (SpMV and triangular solve) */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1989ae82921SPaul Mullowney 
199b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or refresh) the device-side combined L+U CSR matrix used by cusparseSpSV from the
// host factors stored in the MatLUFactorNumeric_SeqAIJ layout (L without its unit diagonal in
// a->i/a->j, U rows addressed backwards through a->diag with inverted diagonal values).
// First call: allocates host/device storage and cuSPARSE descriptors. Later calls: only the
// numerical values are re-packed and copied to the GPU, followed by SpSV analysis (or, on
// CUDA >= 12.1.1, the cheaper cusparseSpSV_updateMatrix when an analysis already exists).
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp; freed once copied to the device
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];      // number of L entries in row i (strictly below diagonal)
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of U entries in row i (diagonal included)
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device; csrVal is filled later in the value-packing loop below
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same combined CSR arrays; only fill mode and diag type differ
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi; // keep the host row pointers and value buffer so later numeric factorizations skip the symbolic work above
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (stored inverted by the CPU factorization)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; // transpose solves must redo their own analysis
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
304d460d7bfSJunchao Zhang #else
305d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
306d71ae5a4SJacob Faibussowitsch {
3079ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3089ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3099ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
310aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3119ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3129ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3139ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3149ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3159ae82921SPaul Mullowney 
3169ae82921SPaul Mullowney   PetscFunctionBegin;
3173ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
318c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3199ae82921SPaul Mullowney     try {
3209ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3219ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
322da79fbbcSStefano Zampini       if (!loTriFactor) {
3232cbc15d9SMark         PetscScalar *AALo;
3242cbc15d9SMark 
3259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3269ae82921SPaul Mullowney 
3279ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3289566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3299566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3309ae82921SPaul Mullowney 
3319ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3329ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3339ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3349ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3359ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3369ae82921SPaul Mullowney         v         = aa;
3379ae82921SPaul Mullowney         vi        = aj;
3389ae82921SPaul Mullowney         offset    = 1;
3399ae82921SPaul Mullowney         rowOffset = 1;
3409ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3419ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
342e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3439ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3449ae82921SPaul Mullowney           rowOffset += nz + 1;
3459ae82921SPaul Mullowney 
346f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
347f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
3489ae82921SPaul Mullowney 
3499ae82921SPaul Mullowney           offset += nz;
3509ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3519ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3529ae82921SPaul Mullowney           offset += 1;
3539ae82921SPaul Mullowney 
3549ae82921SPaul Mullowney           v += nz;
3559ae82921SPaul Mullowney           vi += nz;
3569ae82921SPaul Mullowney         }
3572205254eSKarl Rupp 
358aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3599566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
360da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
361aa372e3fSPaul Mullowney         /* Create the matrix description */
3629566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3641b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3659566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
366afb2bd1cSJunchao Zhang   #else
3679566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
368afb2bd1cSJunchao Zhang   #endif
3699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
371aa372e3fSPaul Mullowney 
372aa372e3fSPaul Mullowney         /* set the operation */
373aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
374aa372e3fSPaul Mullowney 
375aa372e3fSPaul Mullowney         /* set the matrix */
376aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
377aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
378aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
379aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
380aa372e3fSPaul Mullowney 
381aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
382aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
383aa372e3fSPaul Mullowney 
384aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
385aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
386aa372e3fSPaul Mullowney 
387aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
388aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
389aa372e3fSPaul Mullowney 
390afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
3919566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
392261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
3931b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3949371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
3959371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
3969566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
397afb2bd1cSJunchao Zhang   #endif
398afb2bd1cSJunchao Zhang 
399aa372e3fSPaul Mullowney         /* perform the solve analysis */
4009371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4019f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4029566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4039566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
404aa372e3fSPaul Mullowney 
405da79fbbcSStefano Zampini         /* assign the pointer */
406aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4072cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4089566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4099566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4109566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
411da79fbbcSStefano Zampini       } else { /* update values only */
41248a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
413da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4142cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
415da79fbbcSStefano Zampini         v                    = aa;
416da79fbbcSStefano Zampini         vi                   = aj;
417da79fbbcSStefano Zampini         offset               = 1;
418da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
419da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
420f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
421da79fbbcSStefano Zampini           offset += nz;
4222cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
423da79fbbcSStefano Zampini           offset += 1;
424da79fbbcSStefano Zampini           v += nz;
425da79fbbcSStefano Zampini         }
4262cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4279566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
428da79fbbcSStefano Zampini       }
429d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
430d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
431d71ae5a4SJacob Faibussowitsch     }
4329ae82921SPaul Mullowney   }
4333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4349ae82921SPaul Mullowney }
4359ae82921SPaul Mullowney 
436d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
437d71ae5a4SJacob Faibussowitsch {
4389ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
4399ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
4409ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
441aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
4429ae82921SPaul Mullowney   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
4439ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
4449ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4459ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
4469ae82921SPaul Mullowney 
4479ae82921SPaul Mullowney   PetscFunctionBegin;
4483ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
449c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4509ae82921SPaul Mullowney     try {
4519ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4529ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
453da79fbbcSStefano Zampini       if (!upTriFactor) {
4542cbc15d9SMark         PetscScalar *AAUp;
4552cbc15d9SMark 
4569566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
4572cbc15d9SMark 
4589ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4599566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4609566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4619ae82921SPaul Mullowney 
4629ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4639ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4649ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4659ae82921SPaul Mullowney         offset  = nzUpper;
4669ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4679ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4689ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4699ae82921SPaul Mullowney 
470e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4719ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4729ae82921SPaul Mullowney 
473e057df02SPaul Mullowney           /* decrement the offset */
4749ae82921SPaul Mullowney           offset -= (nz + 1);
4759ae82921SPaul Mullowney 
476e057df02SPaul Mullowney           /* first, set the diagonal elements */
4779ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
47809f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
4799ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
4809ae82921SPaul Mullowney 
481f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
482f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
4839ae82921SPaul Mullowney         }
4842205254eSKarl Rupp 
485aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4869566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
487da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
4882205254eSKarl Rupp 
489aa372e3fSPaul Mullowney         /* Create the matrix description */
4909566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
4919566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4921b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4939566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
494afb2bd1cSJunchao Zhang   #else
4959566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
496afb2bd1cSJunchao Zhang   #endif
4979566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
4989566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
499aa372e3fSPaul Mullowney 
500aa372e3fSPaul Mullowney         /* set the operation */
501aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
502aa372e3fSPaul Mullowney 
503aa372e3fSPaul Mullowney         /* set the matrix */
504aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
505aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
506aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
507aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
508aa372e3fSPaul Mullowney 
509aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
510aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
511aa372e3fSPaul Mullowney 
512aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
513aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
514aa372e3fSPaul Mullowney 
515aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
516aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
517aa372e3fSPaul Mullowney 
518afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
5199566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
520261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
5211b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
5229371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5239371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
5249566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
525afb2bd1cSJunchao Zhang   #endif
526afb2bd1cSJunchao Zhang 
527aa372e3fSPaul Mullowney         /* perform the solve analysis */
5289371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5299f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
5309f7ba44dSJacob Faibussowitsch 
5319566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
5329566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
533aa372e3fSPaul Mullowney 
534da79fbbcSStefano Zampini         /* assign the pointer */
535aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
5362cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
5379566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
5389566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
5399566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
540da79fbbcSStefano Zampini       } else {
54148a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
542da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
543da79fbbcSStefano Zampini         offset = nzUpper;
544da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
545da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
546da79fbbcSStefano Zampini 
547da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
548da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
549da79fbbcSStefano Zampini 
550da79fbbcSStefano Zampini           /* decrement the offset */
551da79fbbcSStefano Zampini           offset -= (nz + 1);
552da79fbbcSStefano Zampini 
553da79fbbcSStefano Zampini           /* first, set the diagonal elements */
5542cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
555f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
556da79fbbcSStefano Zampini         }
5572cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5589566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
559da79fbbcSStefano Zampini       }
560d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
561d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
562d71ae5a4SJacob Faibussowitsch     }
5639ae82921SPaul Mullowney   }
5643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5659ae82921SPaul Mullowney }
566d460d7bfSJunchao Zhang #endif
5679ae82921SPaul Mullowney 
568d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
569d71ae5a4SJacob Faibussowitsch {
5709ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
5719ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
572c9e33d71SJunchao Zhang   IS                            isrow = a->row, isicol = a->icol;
5739ae82921SPaul Mullowney   PetscBool                     row_identity, col_identity;
5749ae82921SPaul Mullowney   PetscInt                      n = A->rmap->n;
5759ae82921SPaul Mullowney 
5769ae82921SPaul Mullowney   PetscFunctionBegin;
57728b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
578b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
579d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
580d460d7bfSJunchao Zhang #else
5819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
5829566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
583ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
584d460d7bfSJunchao Zhang #endif
585d460d7bfSJunchao Zhang 
586aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = a->nz;
5879ae82921SPaul Mullowney 
588d460d7bfSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
589e057df02SPaul Mullowney   /* lower triangular indices */
5909566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
591da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
592da79fbbcSStefano Zampini     const PetscInt *r;
593da79fbbcSStefano Zampini 
5949566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow, &r));
595aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
596aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r + n);
5979566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow, &r));
5989566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
599da79fbbcSStefano Zampini   }
6009ae82921SPaul Mullowney 
601e057df02SPaul Mullowney   /* upper triangular indices */
602c9e33d71SJunchao Zhang   PetscCall(ISIdentity(isicol, &col_identity));
603da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
604da79fbbcSStefano Zampini     const PetscInt *c;
605da79fbbcSStefano Zampini 
606c9e33d71SJunchao Zhang     PetscCall(ISGetIndices(isicol, &c));
607aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
608aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c + n);
609c9e33d71SJunchao Zhang     PetscCall(ISRestoreIndices(isicol, &c));
6109566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
611da79fbbcSStefano Zampini   }
6123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6139ae82921SPaul Mullowney }
6149ae82921SPaul Mullowney 
615b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
616d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
617d460d7bfSJunchao Zhang {
618d460d7bfSJunchao Zhang   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
619d460d7bfSJunchao Zhang   PetscInt                      m  = A->rmap->n;
620d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
621d460d7bfSJunchao Zhang   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
622d460d7bfSJunchao Zhang   const MatScalar              *Aa = a->a;
623d460d7bfSJunchao Zhang   PetscInt                     *Mj, Mnz;
624d460d7bfSJunchao Zhang   PetscScalar                  *Ma, *D;
625d460d7bfSJunchao Zhang 
626d460d7bfSJunchao Zhang   PetscFunctionBegin;
627d460d7bfSJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
628d460d7bfSJunchao Zhang     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
629d460d7bfSJunchao Zhang       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
630d460d7bfSJunchao Zhang       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
631d460d7bfSJunchao Zhang       Mnz = Ai[m]; // Unz (with the unit diagonal)
632d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Ma));
633d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
634d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(m, &D));    // the diagonal
635d460d7bfSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
636d460d7bfSJunchao Zhang         PetscInt ulen = Ai[i + 1] - Ai[i];
637d460d7bfSJunchao Zhang         Mj[Ai[i]]     = i;                                              // diagonal entry
638d460d7bfSJunchao Zhang         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
639d460d7bfSJunchao Zhang       }
640d460d7bfSJunchao Zhang       // Copy M (U) from host to device
641f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
642f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
643f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
644f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
645d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
646d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
647d460d7bfSJunchao Zhang 
648d460d7bfSJunchao Zhang       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
649d460d7bfSJunchao Zhang       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
650d460d7bfSJunchao Zhang       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
651d460d7bfSJunchao Zhang       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
652d460d7bfSJunchao Zhang       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
653d460d7bfSJunchao Zhang       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
654d460d7bfSJunchao Zhang       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
655d460d7bfSJunchao Zhang       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
656d460d7bfSJunchao Zhang 
657d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
658d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
659d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
660d460d7bfSJunchao Zhang 
661d460d7bfSJunchao Zhang       // Allocate work vectors in SpSv
662f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
663f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
664d460d7bfSJunchao Zhang 
665d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
666d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
667d460d7bfSJunchao Zhang 
668d460d7bfSJunchao Zhang       // Query buffer sizes for SpSV and then allocate buffers
669d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
670d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
671d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
672d460d7bfSJunchao Zhang 
673aaa8cc7dSPierre Jolivet       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
674d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
675d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
676d460d7bfSJunchao Zhang 
677d460d7bfSJunchao Zhang       // Record for reuse
678d460d7bfSJunchao Zhang       fs->csrVal_h = Ma;
679d460d7bfSJunchao Zhang       fs->diag_h   = D;
680d460d7bfSJunchao Zhang       PetscCall(PetscFree(Mj));
681d460d7bfSJunchao Zhang     }
682d460d7bfSJunchao Zhang     // Copy the value
683d460d7bfSJunchao Zhang     Ma  = fs->csrVal_h;
684d460d7bfSJunchao Zhang     D   = fs->diag_h;
685d460d7bfSJunchao Zhang     Mnz = Ai[m];
686d460d7bfSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
687d460d7bfSJunchao Zhang       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
688d460d7bfSJunchao Zhang       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
689d460d7bfSJunchao Zhang       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
690d460d7bfSJunchao Zhang     }
691d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
692d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
693d460d7bfSJunchao Zhang 
694204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
695204a0e31SJunchao Zhang     if (fs->updatedSpSVAnalysis) {
696204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
697204a0e31SJunchao Zhang       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
698204a0e31SJunchao Zhang     } else
699204a0e31SJunchao Zhang   #endif
700204a0e31SJunchao Zhang     {
701d460d7bfSJunchao Zhang       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
702d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
703d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
704204a0e31SJunchao Zhang       fs->updatedSpSVAnalysis = PETSC_TRUE;
705204a0e31SJunchao Zhang     }
706d460d7bfSJunchao Zhang   }
707d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
708d460d7bfSJunchao Zhang }
709d460d7bfSJunchao Zhang 
710d460d7bfSJunchao Zhang // Solve Ut D U x = b
711d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
712d460d7bfSJunchao Zhang {
713d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
714d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
715d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
716d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
717d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
718d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
719d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
720d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
721d460d7bfSJunchao Zhang 
722d460d7bfSJunchao Zhang   PetscFunctionBegin;
723d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
724d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
725d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
726d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
727d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
728d460d7bfSJunchao Zhang 
729d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
730d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
731d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
732d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
733d460d7bfSJunchao Zhang   } else {
734d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
735d460d7bfSJunchao Zhang   }
736d460d7bfSJunchao Zhang 
737d460d7bfSJunchao Zhang   // Solve Ut Y = X
738d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
739d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
740d460d7bfSJunchao Zhang 
741d460d7bfSJunchao Zhang   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
742d460d7bfSJunchao Zhang   // It is basically a vector element-wise multiplication, but cublas does not have it!
743d460d7bfSJunchao Zhang   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
744d460d7bfSJunchao Zhang 
745d460d7bfSJunchao Zhang   // Solve U X = Y
746d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
747d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
748d460d7bfSJunchao Zhang   } else {
749d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
750d460d7bfSJunchao Zhang   }
751d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
752d460d7bfSJunchao Zhang 
753d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
754d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
755d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
756d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
757d460d7bfSJunchao Zhang   }
758d460d7bfSJunchao Zhang 
759d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
760d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
761d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
762d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
763d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
764d460d7bfSJunchao Zhang }
765d460d7bfSJunchao Zhang #else
766d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
767d71ae5a4SJacob Faibussowitsch {
768087f3262SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
769087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
770aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
771aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
772087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
773087f3262SPaul Mullowney   PetscScalar                       *AAUp;
774087f3262SPaul Mullowney   PetscScalar                       *AALo;
775087f3262SPaul Mullowney   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
776087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
777087f3262SPaul Mullowney   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
778087f3262SPaul Mullowney   const MatScalar                   *aa = b->a, *v;
779087f3262SPaul Mullowney 
780087f3262SPaul Mullowney   PetscFunctionBegin;
7813ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
782c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
783087f3262SPaul Mullowney     try {
7849566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
7859566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
786da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
787087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7889566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
7899566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
790087f3262SPaul Mullowney 
791087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
792087f3262SPaul Mullowney         AiUp[0] = (PetscInt)0;
793087f3262SPaul Mullowney         AiUp[n] = nzUpper;
794087f3262SPaul Mullowney         offset  = 0;
795087f3262SPaul Mullowney         for (i = 0; i < n; i++) {
796087f3262SPaul Mullowney           /* set the pointers */
797087f3262SPaul Mullowney           v  = aa + ai[i];
798087f3262SPaul Mullowney           vj = aj + ai[i];
799087f3262SPaul Mullowney           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
800087f3262SPaul Mullowney 
801087f3262SPaul Mullowney           /* first, set the diagonal elements */
802087f3262SPaul Mullowney           AjUp[offset] = (PetscInt)i;
80309f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0 / v[nz];
804087f3262SPaul Mullowney           AiUp[i]      = offset;
80509f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0 / v[nz];
806087f3262SPaul Mullowney 
807087f3262SPaul Mullowney           offset += 1;
808087f3262SPaul Mullowney           if (nz > 0) {
809f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
810f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
811087f3262SPaul Mullowney             for (j = offset; j < offset + nz; j++) {
812087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
813087f3262SPaul Mullowney               AALo[j] = AAUp[j] / v[nz];
814087f3262SPaul Mullowney             }
815087f3262SPaul Mullowney             offset += nz;
816087f3262SPaul Mullowney           }
817087f3262SPaul Mullowney         }
818087f3262SPaul Mullowney 
819aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8209566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
821da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
822087f3262SPaul Mullowney 
823aa372e3fSPaul Mullowney         /* Create the matrix description */
8249566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
8259566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8261b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
828afb2bd1cSJunchao Zhang   #else
8299566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
830afb2bd1cSJunchao Zhang   #endif
8319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
833087f3262SPaul Mullowney 
834aa372e3fSPaul Mullowney         /* set the matrix */
835aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
836aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = A->rmap->n;
837aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = A->cmap->n;
838aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
839aa372e3fSPaul Mullowney 
840aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
841aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
842aa372e3fSPaul Mullowney 
843aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
844aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
845aa372e3fSPaul Mullowney 
846aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
847aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
848aa372e3fSPaul Mullowney 
849afb2bd1cSJunchao Zhang         /* set the operation */
850afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
851afb2bd1cSJunchao Zhang 
852afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8539566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
854261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
8551b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8569371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8579371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
8589566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
859afb2bd1cSJunchao Zhang   #endif
860afb2bd1cSJunchao Zhang 
861aa372e3fSPaul Mullowney         /* perform the solve analysis */
8629371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8639f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
8649f7ba44dSJacob Faibussowitsch 
8659566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8669566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
867aa372e3fSPaul Mullowney 
868da79fbbcSStefano Zampini         /* assign the pointer */
869aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
870aa372e3fSPaul Mullowney 
871aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8729566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
873da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
874aa372e3fSPaul Mullowney 
875aa372e3fSPaul Mullowney         /* Create the matrix description */
8769566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8779566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8781b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
880afb2bd1cSJunchao Zhang   #else
8819566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
882afb2bd1cSJunchao Zhang   #endif
8839566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8849566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
885aa372e3fSPaul Mullowney 
886aa372e3fSPaul Mullowney         /* set the operation */
887aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
888aa372e3fSPaul Mullowney 
889aa372e3fSPaul Mullowney         /* set the matrix */
890aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
891aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = A->rmap->n;
892aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = A->cmap->n;
893aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
894aa372e3fSPaul Mullowney 
895aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
896aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
897aa372e3fSPaul Mullowney 
898aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
899aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
900aa372e3fSPaul Mullowney 
901aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
902aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
903aa372e3fSPaul Mullowney 
904afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
9059566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
906261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
9071b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9089371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9099371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
9109566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
911afb2bd1cSJunchao Zhang   #endif
912afb2bd1cSJunchao Zhang 
913aa372e3fSPaul Mullowney         /* perform the solve analysis */
9149371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9159f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
9169f7ba44dSJacob Faibussowitsch 
9179566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
9189566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
919aa372e3fSPaul Mullowney 
920da79fbbcSStefano Zampini         /* assign the pointer */
921aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
922087f3262SPaul Mullowney 
9239566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
9249566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
9259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
926da79fbbcSStefano Zampini       } else {
927da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
928da79fbbcSStefano Zampini         offset = 0;
929da79fbbcSStefano Zampini         for (i = 0; i < n; i++) {
930da79fbbcSStefano Zampini           /* set the pointers */
931da79fbbcSStefano Zampini           v  = aa + ai[i];
932da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
933da79fbbcSStefano Zampini 
934da79fbbcSStefano Zampini           /* first, set the diagonal elements */
935da79fbbcSStefano Zampini           AAUp[offset] = 1.0 / v[nz];
936da79fbbcSStefano Zampini           AALo[offset] = 1.0 / v[nz];
937da79fbbcSStefano Zampini 
938da79fbbcSStefano Zampini           offset += 1;
939da79fbbcSStefano Zampini           if (nz > 0) {
940f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
941da79fbbcSStefano Zampini             for (j = offset; j < offset + nz; j++) {
942da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
943da79fbbcSStefano Zampini               AALo[j] = AAUp[j] / v[nz];
944da79fbbcSStefano Zampini             }
945da79fbbcSStefano Zampini             offset += nz;
946da79fbbcSStefano Zampini           }
947da79fbbcSStefano Zampini         }
94828b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
94928b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
950da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
951da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
9529566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
953da79fbbcSStefano Zampini       }
9549566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
9559566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
956d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
957d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
958d71ae5a4SJacob Faibussowitsch     }
959087f3262SPaul Mullowney   }
9603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
961087f3262SPaul Mullowney }
962d460d7bfSJunchao Zhang #endif
963087f3262SPaul Mullowney 
964d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
965d71ae5a4SJacob Faibussowitsch {
966087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
967087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
968087f3262SPaul Mullowney   IS                            ip                 = a->row;
969087f3262SPaul Mullowney   PetscBool                     perm_identity;
970087f3262SPaul Mullowney   PetscInt                      n = A->rmap->n;
971087f3262SPaul Mullowney 
972087f3262SPaul Mullowney   PetscFunctionBegin;
97328b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
974d460d7bfSJunchao Zhang 
975b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
976d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
977d460d7bfSJunchao Zhang #else
9789566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
979ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
980d460d7bfSJunchao Zhang #endif
981aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
982aa372e3fSPaul Mullowney 
983da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
984da79fbbcSStefano Zampini 
985087f3262SPaul Mullowney   /* lower triangular indices */
9869566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
987087f3262SPaul Mullowney   if (!perm_identity) {
9884e4bbfaaSStefano Zampini     IS              iip;
989da79fbbcSStefano Zampini     const PetscInt *irip, *rip;
9904e4bbfaaSStefano Zampini 
9919566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
9929566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip, &irip));
9939566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip, &rip));
994aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
995aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
996aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9974e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
9989566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip, &irip));
9999566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
10009566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip, &rip));
10019566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1002da79fbbcSStefano Zampini   }
10033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1004087f3262SPaul Mullowney }
1005087f3262SPaul Mullowney 
1006d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1007d71ae5a4SJacob Faibussowitsch {
1008087f3262SPaul Mullowney   PetscFunctionBegin;
10099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
10109566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1011ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
1012d460d7bfSJunchao Zhang 
1013b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1014d460d7bfSJunchao Zhang   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1015d460d7bfSJunchao Zhang   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1016d460d7bfSJunchao Zhang #else
1017087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
1018d460d7bfSJunchao Zhang   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1019d460d7bfSJunchao Zhang   IS          ip = b->row;
1020d460d7bfSJunchao Zhang   PetscBool   perm_identity;
1021d460d7bfSJunchao Zhang 
10229566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
1023087f3262SPaul Mullowney   if (perm_identity) {
1024087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1025087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1026087f3262SPaul Mullowney   } else {
1027087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1028087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1029d460d7bfSJunchao Zhang   }
1030d460d7bfSJunchao Zhang #endif
10314e4bbfaaSStefano Zampini   B->ops->matsolve          = NULL;
10324e4bbfaaSStefano Zampini   B->ops->matsolvetranspose = NULL;
1033087f3262SPaul Mullowney 
1034087f3262SPaul Mullowney   /* get the triangular factors */
10359566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
10363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1037087f3262SPaul Mullowney }
10389ae82921SPaul Mullowney 
1039b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1040d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1041d71ae5a4SJacob Faibussowitsch {
1042bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1043aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1044aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1045da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1046da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1047aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1048aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1049aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1050aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1051b175d8bbSPaul Mullowney 
1052bda325fcSPaul Mullowney   PetscFunctionBegin;
1053aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10549566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1055da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1056aa372e3fSPaul Mullowney 
1057aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1058aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1059aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10609371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1061aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1062aa372e3fSPaul Mullowney 
1063aa372e3fSPaul Mullowney   /* Create the matrix description */
10649566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10659566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10669566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10679566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10689566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1069aa372e3fSPaul Mullowney 
1070aa372e3fSPaul Mullowney   /* set the operation */
1071aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1072aa372e3fSPaul Mullowney 
1073aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1074aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1075afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1076afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1077aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1078afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1079afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1080afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1081aa372e3fSPaul Mullowney 
1082aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1083afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10849371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
10859371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10869371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10879566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1088afb2bd1cSJunchao Zhang   #endif
1089afb2bd1cSJunchao Zhang 
10909566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10919f7ba44dSJacob Faibussowitsch   {
10929f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
10939f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
10949371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1095afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10969f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1097afb2bd1cSJunchao Zhang   #else
10989f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1099afb2bd1cSJunchao Zhang   #endif
11009f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11019f7ba44dSJacob Faibussowitsch   }
11029f7ba44dSJacob Faibussowitsch 
11039566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11049566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1105aa372e3fSPaul Mullowney 
1106afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11079566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1108261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11091b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11109371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11119371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11129566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1113afb2bd1cSJunchao Zhang   #endif
1114afb2bd1cSJunchao Zhang 
1115afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11169371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11179f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11189f7ba44dSJacob Faibussowitsch 
11199566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11209566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1121aa372e3fSPaul Mullowney 
1122da79fbbcSStefano Zampini   /* assign the pointer */
1123aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1124aa372e3fSPaul Mullowney 
1125aa372e3fSPaul Mullowney   /*********************************************/
1126aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1127aa372e3fSPaul Mullowney   /*********************************************/
1128aa372e3fSPaul Mullowney 
1129aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11309566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1131da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1132aa372e3fSPaul Mullowney 
1133aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1134aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1135aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11369371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1137aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1138aa372e3fSPaul Mullowney 
1139aa372e3fSPaul Mullowney   /* Create the matrix description */
11409566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11419566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11429566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1145aa372e3fSPaul Mullowney 
1146aa372e3fSPaul Mullowney   /* set the operation */
1147aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1148aa372e3fSPaul Mullowney 
1149aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1150aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1151afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1152afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1153aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1154afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1155afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1156afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1157aa372e3fSPaul Mullowney 
1158aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1159afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11609371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11619371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11629371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11639566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1164afb2bd1cSJunchao Zhang   #endif
1165afb2bd1cSJunchao Zhang 
11669566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11679f7ba44dSJacob Faibussowitsch   {
11689f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11699f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11709371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1171afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11729f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1173afb2bd1cSJunchao Zhang   #else
11749f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1175afb2bd1cSJunchao Zhang   #endif
11769f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11779f7ba44dSJacob Faibussowitsch   }
1178d49cd2b7SBarry Smith 
11799566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11809566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1181aa372e3fSPaul Mullowney 
1182afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11839566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1184261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
11851b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11869371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11879371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
11889566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1189afb2bd1cSJunchao Zhang   #endif
1190afb2bd1cSJunchao Zhang 
1191afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11925f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
11939371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11949f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1195d49cd2b7SBarry Smith 
11969566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11979566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1198aa372e3fSPaul Mullowney 
1199da79fbbcSStefano Zampini   /* assign the pointer */
1200aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1202bda325fcSPaul Mullowney }
1203d460d7bfSJunchao Zhang #endif
1204bda325fcSPaul Mullowney 
12059371c9d4SSatish Balay struct PetscScalarToPetscInt {
12069371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1207a49f1ed0SStefano Zampini };
1208a49f1ed0SStefano Zampini 
1209d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1210d71ae5a4SJacob Faibussowitsch {
1211aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1212a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1213bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1214bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1215aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1216b175d8bbSPaul Mullowney 
1217bda325fcSPaul Mullowney   PetscFunctionBegin;
12189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1219a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
122028b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1221a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
122208401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
12233ba16761SJacob Faibussowitsch   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
12249566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
12259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
122648a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1227a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1228aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12299566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1230aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12319566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12329566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1233aa372e3fSPaul Mullowney 
1234b06137fdSPaul Mullowney     /* set alpha and beta */
1235f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1236f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1237f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
12389566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12399566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1241b06137fdSPaul Mullowney 
1242aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1243aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1244a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1245554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1246554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1247aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1248a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1249aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1250aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1251a3fdcf43SKarl Rupp 
1252ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
125381902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1254afb2bd1cSJunchao Zhang 
1255afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
12563606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
12579371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12589371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
12599371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
12603606e59fSJunchao Zhang   #else
12613606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12623606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12633606e59fSJunchao Zhang 
12643606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12653606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12663606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12673606e59fSJunchao Zhang         */
12683606e59fSJunchao Zhang       if (matrixT->num_entries) {
12699371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
12709371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
12713606e59fSJunchao Zhang 
12723606e59fSJunchao Zhang       } else {
12733606e59fSJunchao Zhang         matstructT->matDescr = NULL;
12743606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
12753606e59fSJunchao Zhang       }
12763606e59fSJunchao Zhang   #endif
1277afb2bd1cSJunchao Zhang #endif
1278aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1279afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1280afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1281afb2bd1cSJunchao Zhang #else
1282aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
128351c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
128451c6d536SStefano Zampini       /* First convert HYB to CSR */
1285aa372e3fSPaul Mullowney       temp->num_rows       = A->rmap->n;
1286aa372e3fSPaul Mullowney       temp->num_cols       = A->cmap->n;
1287aa372e3fSPaul Mullowney       temp->num_entries    = a->nz;
1288aa372e3fSPaul Mullowney       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1289aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1290aa372e3fSPaul Mullowney       temp->values         = new THRUSTARRAY(a->nz);
1291aa372e3fSPaul Mullowney 
12929371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
12939371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1294aa372e3fSPaul Mullowney 
1295aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1296aa372e3fSPaul Mullowney       tempT->num_rows       = A->rmap->n;
1297aa372e3fSPaul Mullowney       tempT->num_cols       = A->cmap->n;
1298aa372e3fSPaul Mullowney       tempT->num_entries    = a->nz;
1299aa372e3fSPaul Mullowney       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1300aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1301aa372e3fSPaul Mullowney       tempT->values         = new THRUSTARRAY(a->nz);
1302aa372e3fSPaul Mullowney 
13039371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
13049371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13059371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1306aa372e3fSPaul Mullowney 
1307aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1308aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13099566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
13109371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
13119371c9d4SSatish Balay       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
13129371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1313aa372e3fSPaul Mullowney 
1314aa372e3fSPaul Mullowney       /* assign the pointer */
1315aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13161a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1317aa372e3fSPaul Mullowney       /* delete temporaries */
1318aa372e3fSPaul Mullowney       if (tempT) {
1319aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1320aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1321aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1322aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1323087f3262SPaul Mullowney       }
1324aa372e3fSPaul Mullowney       if (temp) {
1325aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1326aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1327aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1328aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1329aa372e3fSPaul Mullowney       }
1330afb2bd1cSJunchao Zhang #endif
1331aa372e3fSPaul Mullowney     }
1332a49f1ed0SStefano Zampini   }
1333a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1334a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1335a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
133628b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
133728b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
133828b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
133928b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
134028b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
134128b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
134228b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
134328b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1344a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1345a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1346a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
13479566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1348a49f1ed0SStefano Zampini     }
1349a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1350a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1351792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1352a49f1ed0SStefano Zampini 
1353a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1354a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1355a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1356a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
13579371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
13589371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
13599371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
13609566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1361a49f1ed0SStefano Zampini #endif
1362a49f1ed0SStefano Zampini 
13631a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13641a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13651a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13661a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13671a2c6b5cSJunchao Zhang 
13681a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13691a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13701a2c6b5cSJunchao Zhang         */
13719371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1372a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13739371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
13749371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1375a49f1ed0SStefano Zampini #else
13769371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13779371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1378a49f1ed0SStefano Zampini #endif
13791a2c6b5cSJunchao Zhang       } else {
13801a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
13811a2c6b5cSJunchao Zhang       }
13821a2c6b5cSJunchao Zhang 
1383a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1384792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1385a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13869566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1387a49f1ed0SStefano Zampini #endif
1388a49f1ed0SStefano Zampini     }
13899371c9d4SSatish Balay     PetscCallThrust(
13909371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1391a49f1ed0SStefano Zampini   }
13929566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13939566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1394213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1395213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1396aa372e3fSPaul Mullowney   /* assign the pointer */
1397aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
13981a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
13993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1400bda325fcSPaul Mullowney }
1401bda325fcSPaul Mullowney 
1402b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1403d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1404d460d7bfSJunchao Zhang {
1405d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1406d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1407d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1408d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1409d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1410d460d7bfSJunchao Zhang   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1411d460d7bfSJunchao Zhang   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1412d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1413d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1414d460d7bfSJunchao Zhang 
1415d460d7bfSJunchao Zhang   PetscFunctionBegin;
1416d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1417d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1418d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1419d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1420d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1421d460d7bfSJunchao Zhang 
1422d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1423d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1424d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1425d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1426d460d7bfSJunchao Zhang   } else {
1427d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1428d460d7bfSJunchao Zhang   }
1429d460d7bfSJunchao Zhang 
1430d460d7bfSJunchao Zhang   // Solve L Y = X
1431d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1432d460d7bfSJunchao Zhang   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1433d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1434d460d7bfSJunchao Zhang 
1435d460d7bfSJunchao Zhang   // Solve U X = Y
1436d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1437d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1438d460d7bfSJunchao Zhang   } else {
1439d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1440d460d7bfSJunchao Zhang   }
1441d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1442d460d7bfSJunchao Zhang 
1443d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1444d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1445d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1446d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1447d460d7bfSJunchao Zhang   }
1448d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1449d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1450d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1451d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1452d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1453d460d7bfSJunchao Zhang }
1454d460d7bfSJunchao Zhang 
1455d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1456d460d7bfSJunchao Zhang {
1457d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1458d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1459d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1460d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1461d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1462d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1463d460d7bfSJunchao Zhang   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1464d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1465d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1466d460d7bfSJunchao Zhang 
1467d460d7bfSJunchao Zhang   PetscFunctionBegin;
1468d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1469d460d7bfSJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1470d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1471d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1472d460d7bfSJunchao Zhang                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1473d460d7bfSJunchao Zhang 
1474d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1475d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1476d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1477d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1478d460d7bfSJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1479d460d7bfSJunchao Zhang   }
1480d460d7bfSJunchao Zhang 
1481d460d7bfSJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
1482d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1483d460d7bfSJunchao Zhang 
1484d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1485d460d7bfSJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1486d460d7bfSJunchao Zhang   }
1487d460d7bfSJunchao Zhang 
1488d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1489d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1490d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1491d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1492d460d7bfSJunchao Zhang 
1493d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1494d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1495d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1496d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1497d460d7bfSJunchao Zhang   } else {
1498d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1499d460d7bfSJunchao Zhang   }
1500d460d7bfSJunchao Zhang 
1501d460d7bfSJunchao Zhang   // Solve Ut Y = X
1502d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1503d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1504d460d7bfSJunchao Zhang 
1505d460d7bfSJunchao Zhang   // Solve Lt X = Y
1506d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1507d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1508d460d7bfSJunchao Zhang   } else {
1509d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1510d460d7bfSJunchao Zhang   }
1511d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1512d460d7bfSJunchao Zhang 
1513d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1514d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1515d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1516d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1517d460d7bfSJunchao Zhang   }
1518d460d7bfSJunchao Zhang 
1519d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1520d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1521d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1522d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1523d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1524d460d7bfSJunchao Zhang }
1525d460d7bfSJunchao Zhang #else
1526a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the legacy csrsv API,
  with row/column permutations applied (non-natural ordering path). Compiled only in the #else
  branch of the surrounding CUDA-version guard (the SpSV-based path above replaces it otherwise).

  Since A = L*U (modulo the permutations), A^T = U^T * L^T, so the transposed upper factor is
  solved first, then the transposed lower factor. The transposed factor structures are built
  lazily on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().

  Input:  A  - the factored matrix (triangular factors cached in A->spptr)
          bb - right-hand-side vector
  Output: xx - solution vector
*/
1527d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1528d71ae5a4SJacob Faibussowitsch {
1529c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1530465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1531465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1532465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1533465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1534bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1535aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1536aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1537aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1538bda325fcSPaul Mullowney 
1539bda325fcSPaul Mullowney   PetscFunctionBegin;
1540aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1541aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15429566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
     /* re-fetch: the analysis call populated the transpose factor pointers in A->spptr */
1543aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1544aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1545bda325fcSPaul Mullowney   }
1546bda325fcSPaul Mullowney 
1547bda325fcSPaul Mullowney   /* Get the GPU pointers */
15489566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15499566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1550c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1551c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1552bda325fcSPaul Mullowney 
15539566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1554aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15559371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1556aa372e3fSPaul Mullowney 
1557aa372e3fSPaul Mullowney   /* First, solve U */
     /* i.e., solve U^T y = (permuted) b; input is xarray, result goes into the work vector */
15589f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15599f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1560aa372e3fSPaul Mullowney 
1561aa372e3fSPaul Mullowney   /* Then, solve L */
     /* i.e., solve L^T x = y; input is the work vector, result goes back into xarray */
15629f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15639f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1564aa372e3fSPaul Mullowney 
1565aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15669371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1567aa372e3fSPaul Mullowney 
1568aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1569a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1570bda325fcSPaul Mullowney 
1571bda325fcSPaul Mullowney   /* restore */
15729566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
15739566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
15749566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
   /* two triangular solves cost 2*nnz - n flops */
15759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
15763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1577bda325fcSPaul Mullowney }
1578bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the GPU using the
  legacy csrsv API when the factorization used the natural ordering, so no row/column
  permutations are needed. As above, A^T = U^T * L^T: solve the transposed upper factor
  first, then the transposed lower factor, via the lazily-built transpose factor structures.

  Input:  A  - the factored matrix (triangular factors cached in A->spptr)
          bb - right-hand-side vector
  Output: xx - solution vector
*/
1579d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1580d71ae5a4SJacob Faibussowitsch {
1581465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1582465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1583bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1584aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1585aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1586aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1587bda325fcSPaul Mullowney 
1588bda325fcSPaul Mullowney   PetscFunctionBegin;
1589aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1590aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15919566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
     /* re-fetch: the analysis call populated the transpose factor pointers in A->spptr */
1592aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1593aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1594bda325fcSPaul Mullowney   }
1595bda325fcSPaul Mullowney 
1596bda325fcSPaul Mullowney   /* Get the GPU pointers */
15979566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15989566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1599bda325fcSPaul Mullowney 
16009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1601aa372e3fSPaul Mullowney   /* First, solve U */
     /* i.e., solve U^T y = b directly from barray into the work vector (no permutation needed) */
16029f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16039f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1604aa372e3fSPaul Mullowney 
1605aa372e3fSPaul Mullowney   /* Then, solve L */
     /* i.e., solve L^T x = y from the work vector into xarray */
16069f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16079f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1608bda325fcSPaul Mullowney 
1609bda325fcSPaul Mullowney   /* restore */
16109566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16119566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
   /* two triangular solves cost 2*nnz - n flops */
16139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1615bda325fcSPaul Mullowney }
1616bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU using the legacy csrsv API, with
  row/column permutations applied (non-natural ordering path): permute b by the row
  permutation, solve L then U, then apply the column permutation to produce x.

  Input:  A  - the factored matrix (triangular factors cached in A->spptr)
          bb - right-hand-side vector
  Output: xx - solution vector
*/
1617d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1618d71ae5a4SJacob Faibussowitsch {
1619465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1620465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1621465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1622465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16239ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1624aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1625aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1626aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16279ae82921SPaul Mullowney 
16289ae82921SPaul Mullowney   PetscFunctionBegin;
1629e057df02SPaul Mullowney   /* Get the GPU pointers */
16309566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16319566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1632c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1633c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16349ae82921SPaul Mullowney 
16359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1636aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
   /* NOTE(review): the end iterator reuses bGPU (not bGPU + n) as the element base; the copy
      length appears to come from the rpermIndices range, matching the bGPU + n form used in
      MatSolveTranspose_SeqAIJCUSPARSE above — confirm the two forms are indeed equivalent */
16379371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1638aa372e3fSPaul Mullowney 
1639aa372e3fSPaul Mullowney   /* Next, solve L */
     /* solve L y = (permuted) b; input is the work vector, intermediate result lands in xarray */
16409f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16419f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1642aa372e3fSPaul Mullowney 
1643aa372e3fSPaul Mullowney   /* Then, solve U */
     /* solve U z = y; result goes back into the work vector before the column permutation */
16449f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16459f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1646d49cd2b7SBarry Smith 
16474e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16489371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16499ae82921SPaul Mullowney 
16509566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16519566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16529566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
   /* two triangular solves cost 2*nnz - n flops */
16539566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16559ae82921SPaul Mullowney }
16569ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b on the GPU using the legacy
  csrsv API when the factorization used the natural ordering; no permutations are needed,
  so this is just an L solve followed by a U solve through a work vector.

  Input:  A  - the factored matrix (triangular factors cached in A->spptr)
          bb - right-hand-side vector
  Output: xx - solution vector
*/
1657d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1658d71ae5a4SJacob Faibussowitsch {
1659465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1660465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16619ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1662aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1663aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1664aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16659ae82921SPaul Mullowney 
16669ae82921SPaul Mullowney   PetscFunctionBegin;
1667e057df02SPaul Mullowney   /* Get the GPU pointers */
16689566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16699566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16709ae82921SPaul Mullowney 
16719566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1672aa372e3fSPaul Mullowney   /* First, solve L */
     /* solve L y = b directly from barray into the work vector */
16739f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16749f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1675d49cd2b7SBarry Smith 
1676aa372e3fSPaul Mullowney   /* Next, solve U */
     /* solve U x = y from the work vector into xarray */
16779f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16789f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
16799ae82921SPaul Mullowney 
16809566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16819566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
   /* two triangular solves cost 2*nnz - n flops */
16839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16859ae82921SPaul Mullowney }
1686d460d7bfSJunchao Zhang #endif
16879ae82921SPaul Mullowney 
1688b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of ILU(0) on the GPU (CUDA >= 11.4
  path). Copies A's values device-to-device into fact's value array (the sparsity pattern was
  set up in the symbolic phase), factors in place with cusparseXcsrilu02(), then prepares the
  SpSV triangular-solve descriptors: on CUDA >= 12.1.1 a cheap cusparseSpSV_updateMatrix() is
  used when an analysis already exists; otherwise a full cusparseSpSV_analysis() is run for
  both L and U (analysis is numeric, so it must follow the factorization).

  Input:  fact - the factor matrix from the symbolic phase (Mat_SeqAIJCUSPARSETriFactors in spptr)
          A    - the matrix to factor (must be MATSEQAIJCUSPARSE)
          third parameter (MatFactorInfo) is unused here
*/
16898eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1690d71ae5a4SJacob Faibussowitsch {
1691da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1692da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1693da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1694da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1695da112707SJunchao Zhang   PetscInt                      m, nz;
1696da112707SJunchao Zhang   PetscBool                     flg;
1697da112707SJunchao Zhang 
1698da112707SJunchao Zhang   PetscFunctionBegin;
1699da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1700da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1701da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1702da112707SJunchao Zhang   }
1703da112707SJunchao Zhang 
1704da112707SJunchao Zhang   /* Copy A's value to fact */
1705da112707SJunchao Zhang   m  = fact->rmap->n;
1706da112707SJunchao Zhang   nz = aij->nz;
   /* make sure A's CSR data is current on the device before the device-to-device copy */
1707da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1708da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1709da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1710da112707SJunchao Zhang 
1711bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeBegin());
1712da112707SJunchao Zhang   /* Factorize fact inplace */
17139371c9d4SSatish Balay   if (m)
17149371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1715d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1716da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1717da112707SJunchao Zhang     int              numerical_zero;
1718da112707SJunchao Zhang     cusparseStatus_t status;
1719da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1720da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1721da112707SJunchao Zhang   }
1722da112707SJunchao Zhang 
1723204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
   /* fast path: reuse the existing SpSV analysis and just push the new factored values */
1724204a0e31SJunchao Zhang   if (fs->updatedSpSVAnalysis) {
1725204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1726204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1727204a0e31SJunchao Zhang   } else
1728204a0e31SJunchao Zhang   #endif
1729204a0e31SJunchao Zhang   {
173012ba2bc6SJunchao Zhang     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
173112ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
173212ba2bc6SJunchao Zhang     */
17339371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1734da112707SJunchao Zhang 
17359371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1736da112707SJunchao Zhang 
1737204a0e31SJunchao Zhang     fs->updatedSpSVAnalysis = PETSC_TRUE;
173812ba2bc6SJunchao Zhang     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
173912ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1740204a0e31SJunchao Zhang   }
174112ba2bc6SJunchao Zhang 
   /* factored values live only on the GPU; install the SpSV-based solve callbacks */
1742da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1743d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1744d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1745da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1746da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1747bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeEnd());
1748da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1750da112707SJunchao Zhang }
1751da112707SJunchao Zhang 
17528eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1753d71ae5a4SJacob Faibussowitsch {
1754da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1755da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1756da112707SJunchao Zhang   PetscInt                      m, nz;
1757da112707SJunchao Zhang 
1758da112707SJunchao Zhang   PetscFunctionBegin;
1759da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1760da112707SJunchao Zhang     PetscInt  i;
1761da112707SJunchao Zhang     PetscBool flg, missing;
1762da112707SJunchao Zhang 
1763da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1764da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1765da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1766da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1767da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1768da112707SJunchao Zhang   }
1769da112707SJunchao Zhang 
1770da112707SJunchao Zhang   /* Free the old stale stuff */
1771da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1772da112707SJunchao Zhang 
1773da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1774da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1775da112707SJunchao Zhang    */
1776da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1777da112707SJunchao Zhang 
1778da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1779da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1780da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1781da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1782da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1783da112707SJunchao Zhang 
1784da112707SJunchao Zhang   aij->row = NULL;
1785da112707SJunchao Zhang   aij->col = NULL;
1786da112707SJunchao Zhang 
1787da112707SJunchao Zhang   /* ====================================================================== */
1788da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1789da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1790da112707SJunchao Zhang   /* ====================================================================== */
1791da112707SJunchao Zhang   const int *Ai, *Aj;
1792da112707SJunchao Zhang 
1793da112707SJunchao Zhang   m  = fact->rmap->n;
1794da112707SJunchao Zhang   nz = aij->nz;
1795da112707SJunchao Zhang 
1796f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1797f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1798f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1799d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1800d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1801d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1802da112707SJunchao Zhang 
1803da112707SJunchao Zhang   /* ====================================================================== */
1804da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1805da112707SJunchao Zhang   /* ====================================================================== */
1806da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1807da112707SJunchao Zhang   cusparseDiagType_t diagType;
1808da112707SJunchao Zhang 
1809da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1810da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1811da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1812da112707SJunchao Zhang 
1813da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1814da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1815da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1816da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1817da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1818da112707SJunchao Zhang   */
1819da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1820da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1821d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18229371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18239371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1824da112707SJunchao Zhang 
1825da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1826da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1827d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18289371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18299371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1830da112707SJunchao Zhang 
1831da112707SJunchao Zhang   /* ========================================================================= */
1832da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1833da112707SJunchao Zhang   /* ========================================================================= */
1834da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18359371c9d4SSatish Balay   if (m)
18369371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1837d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1838da112707SJunchao Zhang 
1839da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1840da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1841da112707SJunchao Zhang 
1842da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1843da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1844da112707SJunchao Zhang 
1845da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18469371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1847da112707SJunchao Zhang 
1848da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18499371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1850da112707SJunchao Zhang 
1851da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
185212ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
185312ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
185412ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1855da112707SJunchao Zhang    */
185612ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
185712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
185812ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1859da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
186012ba2bc6SJunchao Zhang   } else {
186112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
186212ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1863da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
186412ba2bc6SJunchao Zhang   }
1865da112707SJunchao Zhang 
1866da112707SJunchao Zhang   /* ========================================================================== */
1867da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1868da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1869da112707SJunchao Zhang   /* ========================================================================== */
1870da112707SJunchao Zhang   int              structural_zero;
1871da112707SJunchao Zhang   cusparseStatus_t status;
1872da112707SJunchao Zhang 
1873da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18749371c9d4SSatish Balay   if (m)
18759371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1876d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1877da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
187846aba097SBarry Smith     /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1879da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1880da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1881da112707SJunchao Zhang   }
1882da112707SJunchao Zhang 
1883da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18840dd8c0acSJunchao Zhang   {
1885da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
18860dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1887da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1888da112707SJunchao Zhang 
1889da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1890da112707SJunchao Zhang     Ai    = Aseq->i;
1891da112707SJunchao Zhang     Adiag = Aseq->diag;
1892da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1893da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1894da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1895da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1896da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1897da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1898da112707SJunchao Zhang         */
1899da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1900da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1901da112707SJunchao Zhang       }
1902da112707SJunchao Zhang     }
1903da112707SJunchao Zhang     fs->numericFactFlops = flops;
19040dd8c0acSJunchao Zhang   }
1905da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1907da112707SJunchao Zhang }
1908da112707SJunchao Zhang 
/* Solve x = (L L^T)^{-1} b using the IC(0) factor produced by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0().
   Performs a forward solve L y = b into the work vector Y, then a backward solve L^T x = y,
   both via cusparseSpSV with the analysis data prepared during factorization. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdev; /* device array of b (read-only) */
  PetscScalar                  *xdev; /* device array of x (write-only) */

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve: L Y = b. Point the X descriptor at b's data and the Y descriptor at the work buffer. */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdev));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Backward solve: L^T x = Y. Re-point the X descriptor at x's data to receive the result. */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdev));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n)); /* two triangular solves over nz nonzeros */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1939da112707SJunchao Zhang 
/* Numeric IC(0) factorization of a MATSEQAIJCUSPARSE matrix via cusparseXcsric02().
   All descriptors, analysis info, and buffers were prepared by
   MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(); this routine only refreshes fact's
   value array from A and redoes the numeric factorization in place on the GPU.

   Input Parameters:
+  fact - the factor matrix holding Mat_SeqAIJCUSPARSETriFactors in spptr
.  A    - the matrix to factor (must be MATSEQAIJCUSPARSE)
-  info - unused

   On success, installs the ICC0 solve callbacks and marks fact as GPU-resident. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    /* Use PetscCheck (not PetscAssert) for consistency with the structural-zero-pivot checks
       in the symbolic phases, so the check is always active inside this debug-guarded block */
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* CUDA >= 12.1.1 can update the values in an existing SpSV analysis instead of redoing it */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* L L^T is symmetric, so transpose solve is the same */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimated during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2001da112707SJunchao Zhang 
/* Symbolic IC(0) factorization setup for MATSEQAIJCUSPARSE with natural ordering.
   Since IC(0) introduces no fill, fact reuses A's sparsity pattern directly: this routine
   copies A's CSR i/j arrays to the device, creates the cusparse matrix/vector descriptors,
   sizes and allocates the csric02 and SpSV buffers (sharing the factorization buffer with
   the larger of the two solve buffers to save memory), runs the csric02 structural analysis,
   and estimates the numeric-phase FLOPs. The permutation IS argument is intentionally
   unused: callers only dispatch here when the ordering is the identity. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    /* csric02 requires a square matrix with a fully present diagonal */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) keeps A's pattern: no fill beyond the original nonzeros */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  /* 32-bit row/col index arrays: csric02 works with int indices */
  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* M: legacy descriptor for csric02; type must be GENERAL (see the csric02 docs) */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L: generic SpMat descriptor aliasing the same device arrays; SpSV only touches the lower triangle */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); /* csric02 errors out on empty matrices */

  /* X, Y: work vectors wrapped by dense-vector descriptors for SpSV */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M; /* aliased: factBuffer_M is sized to cover both uses */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M; /* aliased: factBuffer_M is sized to cover both uses */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate, one by one, the nonzeros to the left of the diagonal. Assume each
          elimination updates the nonzeros to the right of (and including) the eliminated entry,
          which incurs one multiplication and one addition per updated entry.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* logged by the numeric phase on each factorization */
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2143da112707SJunchao Zhang #endif
2144da112707SJunchao Zhang 
/* Numeric LU factorization for MATSEQAIJCUSPARSE: the factorization itself runs on the
   CPU (via MatLUFactorNumeric_SeqAIJ on A's host copy); only the subsequent triangular
   solves are offloaded to the GPU unless use_cpu_solve is set. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* CPU factorization needs A's values on the host */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factor values currently live only on the host */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      /* natural ordering: skip the permutation steps in the solve */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2183d460d7bfSJunchao Zhang 
/* Symbolic LU factorization for MATSEQAIJCUSPARSE: drop any stale GPU triangular-factor
   state, delegate the symbolic phase to the CPU SeqAIJ implementation, and install the
   CUSPARSE numeric-phase callback. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors)); /* discard leftovers from a previous factorization */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2194d460d7bfSJunchao Zhang 
/* Symbolic ILU factorization dispatcher for MATSEQAIJCUSPARSE.
   With CUDA >= 11.4, ILU(0) under the natural ordering (identity row/column
   permutations, and not forced onto the host) is routed to the cusparse-native
   ILU0 path; everything else falls back to the CPU SeqAIJ symbolic phase.
   Note: the `} else` before the #endif deliberately attaches the brace-block
   below as its else-branch; when the #if is compiled out, that block runs
   unconditionally. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) { /* factoronhost forces the CPU path, so skip the identity tests */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) { /* ILU(0) with natural ordering */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2217da112707SJunchao Zhang 
/* Symbolic ICC factorization dispatcher for MATSEQAIJCUSPARSE.
   With CUDA >= 11.4, ICC(0) under the natural ordering (identity permutation, and
   not forced onto the host) is routed to the cusparse-native IC0 path; everything
   else falls back to the CPU SeqAIJ symbolic phase. The `} else` before the #endif
   deliberately attaches the brace-block below as its else-branch; when the #if is
   compiled out, that block runs unconditionally. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); /* factoronhost forces the CPU path */
  if (!info->levels && perm_identity) { /* ICC(0) with natural ordering */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2237da112707SJunchao Zhang 
/* Symbolic Cholesky factorization for MATSEQAIJCUSPARSE: clear any stale GPU
   triangular-factor state, run the CPU SeqAIJ symbolic phase, and register the
   CUSPARSE numeric-phase routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors)); /* discard leftovers from a previous factorization */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2248da112707SJunchao Zhang 
/* Report the MatSolverType ("cusparse") that produced this factor matrix;
   registered as the MatFactorGetSolverType_C method for factors of this package. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2255841d4cb1SJunchao Zhang 
2256841d4cb1SJunchao Zhang /*MC
2257841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
225811a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2260841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
226111a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2262841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2263841d4cb1SJunchao Zhang 
2264841d4cb1SJunchao Zhang   Level: beginner
2265841d4cb1SJunchao Zhang 
22661cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22672ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2268841d4cb1SJunchao Zhang M*/
2269841d4cb1SJunchao Zhang 
2270d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2271d71ae5a4SJacob Faibussowitsch {
2272841d4cb1SJunchao Zhang   PetscInt n = A->rmap->n;
2273841d4cb1SJunchao Zhang 
2274841d4cb1SJunchao Zhang   PetscFunctionBegin;
2275841d4cb1SJunchao Zhang   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2276841d4cb1SJunchao Zhang   PetscCall(MatSetSizes(*B, n, n, n, n));
2277b820271fSJunchao Zhang   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2278841d4cb1SJunchao Zhang   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2279841d4cb1SJunchao Zhang 
2280841d4cb1SJunchao Zhang   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2281841d4cb1SJunchao Zhang   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2282841d4cb1SJunchao Zhang     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2283841d4cb1SJunchao Zhang     if (!A->boundtocpu) {
2284841d4cb1SJunchao Zhang       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2285841d4cb1SJunchao Zhang       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2286841d4cb1SJunchao Zhang     } else {
2287841d4cb1SJunchao Zhang       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2288841d4cb1SJunchao Zhang       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2289841d4cb1SJunchao Zhang     }
2290841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2291841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2292841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2293841d4cb1SJunchao Zhang   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2294841d4cb1SJunchao Zhang     if (!A->boundtocpu) {
2295841d4cb1SJunchao Zhang       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2296841d4cb1SJunchao Zhang       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2297841d4cb1SJunchao Zhang     } else {
2298841d4cb1SJunchao Zhang       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2299841d4cb1SJunchao Zhang       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2300841d4cb1SJunchao Zhang     }
2301841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2302841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2303841d4cb1SJunchao Zhang   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2304841d4cb1SJunchao Zhang 
2305841d4cb1SJunchao Zhang   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2306841d4cb1SJunchao Zhang   (*B)->canuseordering = PETSC_TRUE;
2307f4f49eeaSPierre Jolivet   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
23083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2309841d4cb1SJunchao Zhang }
2310841d4cb1SJunchao Zhang 
/* Synchronize the matrix values from the device back to the host CSR array a->a
   when the device copy is the authoritative one (offloadmask == PETSC_OFFLOAD_GPU).
   - Unfactored matrices: values come from the CsrMatrix inside cusp->mat.
   - Factored matrices (CUDA >= 11.4): values come from fs->csrVal when present.
   Other factored cases are unsupported and raise an error.
   Note: cusp and fs alias the same A->spptr; which interpretation is valid depends
   on A->factortype. On success host and device agree, so the mask becomes
   PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; /* valid view only when A is factored */
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      /* blocking device->host copy of just the a->nz values (pattern is already on host) */
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23407e8381f9SStefano Zampini 
/* MatSeqAIJGetArray() implementation: return the host CSR value array a->a for
   read/write access, first syncing device->host if the GPU copy is authoritative. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
234867a45760SJunchao Zhang 
/* MatSeqAIJRestoreArray() implementation: the caller may have modified the host
   values, so mark the host copy as the up-to-date one (invalidating the device
   copy) and null out the returned pointer. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
235667a45760SJunchao Zhang 
/* MatSeqAIJGetArrayRead() implementation: return the host CSR value array for
   read-only access after syncing device->host; the offload mask is untouched so
   the device copy remains valid. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236467a45760SJunchao Zhang 
/* MatSeqAIJRestoreArrayRead() implementation: read-only access changed nothing,
   so only the returned pointer is invalidated; no offload-mask update is needed. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237167a45760SJunchao Zhang 
/* MatSeqAIJGetArrayWrite() implementation: return the host CSR value array for
   write-only access. No device->host copy is performed since the caller is
   expected to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237867a45760SJunchao Zhang 
/* MatSeqAIJRestoreArrayWrite() implementation: the host values were (re)written,
   so mark the host copy as authoritative and invalidate the returned pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
23867e8381f9SStefano Zampini 
2387d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2388d71ae5a4SJacob Faibussowitsch {
23897ee59b9bSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusp;
23907ee59b9bSJunchao Zhang   CsrMatrix          *matrix;
23917ee59b9bSJunchao Zhang 
23927ee59b9bSJunchao Zhang   PetscFunctionBegin;
23937ee59b9bSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
23947ee59b9bSJunchao Zhang   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
23957ee59b9bSJunchao Zhang   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
23967ee59b9bSJunchao Zhang   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
23977ee59b9bSJunchao Zhang   matrix = (CsrMatrix *)cusp->mat->mat;
23987ee59b9bSJunchao Zhang 
23997ee59b9bSJunchao Zhang   if (i) {
24007ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES)
24017ee59b9bSJunchao Zhang     *i = matrix->row_offsets->data().get();
24027ee59b9bSJunchao Zhang #else
24037ee59b9bSJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
24047ee59b9bSJunchao Zhang #endif
24057ee59b9bSJunchao Zhang   }
24067ee59b9bSJunchao Zhang   if (j) {
24077ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES)
24087ee59b9bSJunchao Zhang     *j = matrix->column_indices->data().get();
24097ee59b9bSJunchao Zhang #else
24107ee59b9bSJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
24117ee59b9bSJunchao Zhang #endif
24127ee59b9bSJunchao Zhang   }
24137ee59b9bSJunchao Zhang   if (a) *a = matrix->values->data().get();
24147ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
24153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
24167ee59b9bSJunchao Zhang }
24177ee59b9bSJunchao Zhang 
/* Copy the host CSR data of A to the GPU, (re)building the cuSPARSE structures.
   Two paths:
   - Fast path: if the nonzero pattern is unchanged (nonzerostate matches) and the
     format is CSR, only the values array is copied into the existing CsrMatrix.
   - Full rebuild: destroy the old Mat_SeqAIJCUSPARSEMultStruct and create a new
     one, uploading row offsets, column indices and values (honoring compressed-row
     storage when in use), plus the device-side alpha/beta scalar constants used by
     SpMV/SpMM.
   The ELL/HYB formats are only supported before CUDA 11.0.
   If the host had no values yet (a->a == NULL) only the pattern is uploaded and
   the offload mask is NOT set to PETSC_OFFLOAD_BOTH (both = PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when only the pattern (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz); /* host->device value upload via thrust */
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* values changed: transpose values stale, structure still OK */
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* pattern changed (or first upload): tear down everything derived from the old pattern */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented; ridx maps back to global rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          nnz  = ii[m]; /* no values on host yet: take nnz from the row offsets, upload pattern only */
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1 and 0) for cuSPARSE calls in device pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: upload a temporary CSR, convert to HYB/ELL, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m; /* tmp only feeds the host->device byte-count logging below */
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25669ae82921SPaul Mullowney 
/* Thrust functor over a 2-tuple (src, dst): accumulates dst = dst + src.
   Intended for thrust algorithms applied to zipped source/destination ranges. */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2574aa372e3fSPaul Mullowney 
/* Thrust functor over a 2-tuple (src, dst): copies dst = src. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
25827e8381f9SStefano Zampini 
/* Thrust functor over a 2-tuple: copies in the reverse direction of VecCUDAEquals,
   i.e. element 0 receives element 1. */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2590e6e9a74fSStefano Zampini 
/* Per-product context attached to C->product->data for sparse x dense and
   sparse x sparse products involving MATSEQAIJCUSPARSE matrices. Freed by
   MatProductCtxDestroy_MatMatCusparse(). */
struct MatProductCtx_MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably whether C was created dense — confirm against the symbolic phase */
  PetscScalar   *Bt;       /* device buffer (released with cudaFree) — assumed to hold an explicit transpose of B; TODO confirm */
  Mat            X;        /* auxiliary dense intermediate, used in place of C for PtAP/RARt products */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B (generic cuSPARSE API) */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C (or X) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers required by the CUDA >= 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2615ccdfe979SStefano Zampini 
/* Destructor for a MatProductCtx_MatMatCusparse: releases the device buffers,
   cuSPARSE descriptors, the auxiliary matrix X, and finally the context itself.
   Descriptor/buffer members are freed only when non-NULL, so a partially
   initialized context is handled safely. */
static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(void **data)
{
  MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(*data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2639ccdfe979SStefano Zampini 
26404742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2641ccdfe979SStefano Zampini 
2642d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643d71ae5a4SJacob Faibussowitsch {
2644ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2645ccdfe979SStefano Zampini   Mat                           A, B;
2646afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2647ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2648ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2649ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2650ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2651ccdfe979SStefano Zampini   const PetscScalar            *barray;
2652ccdfe979SStefano Zampini   PetscScalar                  *carray;
2653*cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2654ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2656ccdfe979SStefano Zampini 
2657ccdfe979SStefano Zampini   PetscFunctionBegin;
2658ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
265928b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660*cc1eb50dSBarry Smith   mmdata = (MatProductCtx_MatMatCusparse *)product->data;
2661ccdfe979SStefano Zampini   A      = product->A;
2662ccdfe979SStefano Zampini   B      = product->B;
26639566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
266428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
266728b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670ccdfe979SStefano Zampini   switch (product->type) {
2671ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2672ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2673ccdfe979SStefano Zampini     mat = cusp->mat;
2674ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675ccdfe979SStefano Zampini     m   = A->rmap->n;
2676ccdfe979SStefano Zampini     n   = B->cmap->n;
2677ccdfe979SStefano Zampini     break;
2678ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
26791a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2680e6e9a74fSStefano Zampini       mat = cusp->mat;
2681e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682e6e9a74fSStefano Zampini     } else {
26839566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2685ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686e6e9a74fSStefano Zampini     }
2687ccdfe979SStefano Zampini     m = A->cmap->n;
2688ccdfe979SStefano Zampini     n = B->cmap->n;
2689ccdfe979SStefano Zampini     break;
2690ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2691ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2692ccdfe979SStefano Zampini     mat = cusp->mat;
2693ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694ccdfe979SStefano Zampini     m   = A->rmap->n;
2695ccdfe979SStefano Zampini     n   = B->rmap->n;
2696ccdfe979SStefano Zampini     break;
2697d71ae5a4SJacob Faibussowitsch   default:
2698d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699ccdfe979SStefano Zampini   }
270028b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2702ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27039566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27049566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2706afb2bd1cSJunchao Zhang 
27079566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2708c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27109566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711c8378d12SStefano Zampini   } else {
2712cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27139566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2714c8378d12SStefano Zampini   }
2715c8378d12SStefano Zampini 
27169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2717afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2720fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2721fe5544b9SJunchao Zhang   #else
2722fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2723fe5544b9SJunchao Zhang   #endif
2724fe5544b9SJunchao Zhang 
2725a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2726afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2727fcdce8c4SStefano Zampini     size_t mmBufferSize;
27289371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27299371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27309371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27319371c9d4SSatish Balay     }
2732afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27339566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2734afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2735afb2bd1cSJunchao Zhang     }
2736c8378d12SStefano Zampini 
27379371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27389371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27399371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27409371c9d4SSatish Balay     }
2741afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27429566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2743afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2744afb2bd1cSJunchao Zhang     }
2745afb2bd1cSJunchao Zhang 
2746fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2747fe5544b9SJunchao Zhang     if (matADescr) {
274817f5f06fSJunchao Zhang       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2749fe5544b9SJunchao Zhang       matADescr = NULL;
2750fe5544b9SJunchao Zhang     }
2751fe5544b9SJunchao Zhang   #endif
2752fe5544b9SJunchao Zhang 
2753fe5544b9SJunchao Zhang     if (!matADescr) {
2754fe5544b9SJunchao Zhang       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27559371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27569371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2757afb2bd1cSJunchao Zhang     }
2758fe5544b9SJunchao Zhang 
2759fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2760fe5544b9SJunchao Zhang 
2761fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27629566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27639566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2764fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2765fcdce8c4SStefano Zampini     }
2766fe5544b9SJunchao Zhang 
2767f0b74427SPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2768fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2769fe5544b9SJunchao Zhang   #endif
2770fe5544b9SJunchao Zhang 
2771afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2772afb2bd1cSJunchao Zhang   } else {
2773afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2774fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
27759566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
27769566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2777afb2bd1cSJunchao Zhang   }
2778afb2bd1cSJunchao Zhang 
2779afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2780fe5544b9SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2781afb2bd1cSJunchao Zhang #else
2782afb2bd1cSJunchao Zhang   PetscInt k;
2783afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2784ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2785ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2786ccdfe979SStefano Zampini     cublasStatus_t cerr;
2787ccdfe979SStefano Zampini 
27889566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
27899371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
27909371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2791ccdfe979SStefano Zampini     blda = B->cmap->n;
2792afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2793afb2bd1cSJunchao Zhang   } else {
2794afb2bd1cSJunchao Zhang     k = B->rmap->n;
2795ccdfe979SStefano Zampini   }
2796ccdfe979SStefano Zampini 
2797afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
27989371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
27999371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2800afb2bd1cSJunchao Zhang #endif
28019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
28029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2803cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2804ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2805cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28064742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2807ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2808cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28094742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2810ccdfe979SStefano Zampini   } else {
2811cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2812ccdfe979SStefano Zampini   }
281348a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
281448a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
28153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2816ccdfe979SStefano Zampini }
2817ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for products C = op(A)*op(B)
  where A is MATSEQAIJCUSPARSE (CSR storage only) and B is dense.

  Responsibilities (no numerical work is done here):
  - validate the product and the type/format of A;
  - compute the result dimensions m x n for the requested product type and propagate
    block sizes from the operands onto C's layouts;
  - set C's type to MATSEQDENSECUDA (remembering via mmdata->cisdense whether the caller
    supplied a CPU MATSEQDENSE, so the numeric phase can convert the result back);
  - allocate the product context (MatProductCtx_MatMatCusparse) and, for RARt/PtAP,
    the intermediate dense matrix mmdata->X holding A*Rt (resp. A*P);
  - install MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine.

  Collective; returns a PetscErrorCode (PETSC_SUCCESS on success).
*/
2818d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2819d71ae5a4SJacob Faibussowitsch {
2820ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2821ccdfe979SStefano Zampini   Mat                           A, B;
2822ccdfe979SStefano Zampini   PetscInt                      m, n;
2823ccdfe979SStefano Zampini   PetscBool                     cisdense, flg;
2824*cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2825ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2826ccdfe979SStefano Zampini 
2827ccdfe979SStefano Zampini   PetscFunctionBegin;
2828ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
  /* symbolic must run exactly once on a fresh product */
282928b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2830ccdfe979SStefano Zampini   A = product->A;
2831ccdfe979SStefano Zampini   B = product->B;
2832ccdfe979SStefano Zampini9566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
283328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2834ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
283508401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n per product type; block sizes of C come from the operands */
2836ccdfe979SStefano Zampini   switch (product->type) {
2837ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2838ccdfe979SStefano Zampini     m = A->rmap->n;
2839ccdfe979SStefano Zampini     n = B->cmap->n;
28400e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2841ccdfe979SStefano Zampini     break;
2842ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2843ccdfe979SStefano Zampini     m = A->cmap->n;
2844ccdfe979SStefano Zampini     n = B->cmap->n;
28450e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
28460e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2847ccdfe979SStefano Zampini     break;
2848ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2849ccdfe979SStefano Zampini     m = A->rmap->n;
2850ccdfe979SStefano Zampini     n = B->rmap->n;
28510e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
28520e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2853ccdfe979SStefano Zampini     break;
  /* PtAP: C = P^T A P is square with both dimensions from P's (= B's) column space */
2854ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2855ccdfe979SStefano Zampini     m = B->cmap->n;
2856ccdfe979SStefano Zampini     n = B->cmap->n;
28570e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
28580e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2859ccdfe979SStefano Zampini     break;
  /* RARt: C = R A R^T is square with both dimensions from R's (= B's) row space */
2860ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2861ccdfe979SStefano Zampini     m = B->rmap->n;
2862ccdfe979SStefano Zampini     n = B->rmap->n;
28630e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
28640e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2865ccdfe979SStefano Zampini     break;
2866d71ae5a4SJacob Faibussowitsch   default:
2867d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2868ccdfe979SStefano Zampini   }
28699566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2870ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
28719566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
28729566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2873ccdfe979SStefano Zampini 
2874ccdfe979SStefano Zampini   /* product data */
28759566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2876ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2877afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2878afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
287948a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2880afb2bd1cSJunchao Zhang #endif
2881ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2882ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
28839566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
28849566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2885ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
28869566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2887ccdfe979SStefano Zampini     } else {
28889566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2889ccdfe979SStefano Zampini     }
2890ccdfe979SStefano Zampini   }
2891ccdfe979SStefano Zampini   C->product->data    = mmdata;
2892*cc1eb50dSBarry Smith   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2893ccdfe979SStefano Zampini 
2894ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
28953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2896ccdfe979SStefano Zampini }
2897ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse
  products C = A*B, A^T*B, A*B^T for MATSEQAIJCUSPARSE operands (CSR storage only).

  The cuSPARSE SpGEMM API does not accept transpose flags (opA/opB are forced to
  CUSPARSE_OPERATION_NON_TRANSPOSE below), so AtB/ABt use the explicit transposes
  (matTranspose) formed during the symbolic phase. Three backends are selected at
  compile time: cusparseSpGEMMreuse_compute (CUDA >= 11.4), cusparseSpGEMM_compute
  followed by cusparseSpGEMM_copy (CUDA 11.0-11.3), or cusparse_csr_spgemm (CUDA < 11).
  The routine ends at the "finalize" label with a shortened MatAssemblyEnd_SeqAIJ.

  Collective; returns a PetscErrorCode (PETSC_SUCCESS on success).
*/
2898d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2899d71ae5a4SJacob Faibussowitsch {
2900ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2901fcdce8c4SStefano Zampini   Mat                           A, B;
2902fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2903fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2904fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2905fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2906fcdce8c4SStefano Zampini   PetscBool                     flg;
2907fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2908fcdce8c4SStefano Zampini   MatProductType                ptype;
2909*cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
2910fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2911fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2912fcdce8c4SStefano Zampini #endif
2913b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2914ccdfe979SStefano Zampini 
2915ccdfe979SStefano Zampini   PetscFunctionBegin;
2916ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
  /* the numeric phase requires the context created by the symbolic phase */
291728b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
29189566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
291928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2920*cc1eb50dSBarry Smith   mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
2921fcdce8c4SStefano Zampini   A      = product->A;
2922fcdce8c4SStefano Zampini   B      = product->B;
2923fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2924fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2925fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
292608401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2927fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
292828b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2929fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
293028b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2931fcdce8c4SStefano Zampini     goto finalize;
2932fcdce8c4SStefano Zampini   }
  /* C has no nonzeros: skip the GPU product, only finish the assembly bookkeeping */
2933fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29349566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
293528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29369566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
293728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
293828b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
293928b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2940fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2941fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2942fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
294308401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
294408401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
294508401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the device */
29469566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29479566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2948fcdce8c4SStefano Zampini 
  /* mirror the symbolic phase: AtB with symmetric A (resp. ABt with symmetric B) was built as plain AB */
2949fcdce8c4SStefano Zampini   ptype = product->type;
2950b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2951fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
295228b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2953fa046f9fSJunchao Zhang   }
2954b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2955fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
295628b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2957fa046f9fSJunchao Zhang   }
  /* pick the operand mult structs; transposed operands use the stored explicit transpose */
2958fcdce8c4SStefano Zampini   switch (ptype) {
2959fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2960fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2961fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2962fcdce8c4SStefano Zampini     break;
2963fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2964fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2965fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2966fcdce8c4SStefano Zampini     break;
2967fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2968fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2969fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2970fcdce8c4SStefano Zampini     break;
2971d71ae5a4SJacob Faibussowitsch   default:
2972d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2973fcdce8c4SStefano Zampini   }
2974fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
297528b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
297628b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
297728b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2978fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2979fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2980fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
298128b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
298228b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
298328b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
29849566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2985fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2986fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
29879566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2988b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* reuse API: sparsity pattern was analyzed during symbolic, only values are recomputed */
29899371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29909371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2991b4285af6SJunchao Zhang   #else
29929371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29939371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29949371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29959371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2996b4285af6SJunchao Zhang   #endif
2997fcdce8c4SStefano Zampini #else
29989371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29999371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3001fcdce8c4SStefano Zampini #endif
30029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30039566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
30049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
  /* the up-to-date values of C now live only on the device */
3005fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
3006fcdce8c4SStefano Zampini finalize:
3007fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
30089566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
30099566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
30109566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3011fcdce8c4SStefano Zampini   c->reallocs = 0;
3012fcdce8c4SStefano Zampini   C->info.mallocs += 0;
3013fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
3014fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
3015fcdce8c4SStefano Zampini   C->num_ass++;
30163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3017ccdfe979SStefano Zampini }
3018fcdce8c4SStefano Zampini 
3019d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3020d71ae5a4SJacob Faibussowitsch {
3021fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3022fcdce8c4SStefano Zampini   Mat                           A, B;
3023fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3024fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3025fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3026fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3027fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3028fcdce8c4SStefano Zampini   PetscBool                     flg;
3029fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3030fcdce8c4SStefano Zampini   MatProductType                ptype;
3031*cc1eb50dSBarry Smith   MatProductCtx_MatMatCusparse *mmdata;
3032fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3033fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3034fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3035fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3036fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3037fcdce8c4SStefano Zampini #else
3038fcdce8c4SStefano Zampini   int cnz;
3039fcdce8c4SStefano Zampini #endif
3040b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3041fcdce8c4SStefano Zampini 
3042fcdce8c4SStefano Zampini   PetscFunctionBegin;
3043fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
304428b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3045fcdce8c4SStefano Zampini   A = product->A;
3046fcdce8c4SStefano Zampini   B = product->B;
30479566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
304828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30499566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
305028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3051fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3052fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3053fcdce8c4SStefano Zampini   /* product data */
30549566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3055fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3056*cc1eb50dSBarry Smith   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3057fcdce8c4SStefano Zampini 
30589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3060d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3061d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
306208401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
306308401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3064d60bce21SJunchao Zhang 
3065fcdce8c4SStefano Zampini   ptype = product->type;
3066b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3067fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3068fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3069fa046f9fSJunchao Zhang   }
3070b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3071fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3072fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3073fa046f9fSJunchao Zhang   }
3074fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3075fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3076fcdce8c4SStefano Zampini   switch (ptype) {
3077fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3078fcdce8c4SStefano Zampini     m    = A->rmap->n;
3079fcdce8c4SStefano Zampini     n    = B->cmap->n;
3080fcdce8c4SStefano Zampini     k    = A->cmap->n;
3081fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3082fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3083fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3084fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3085fcdce8c4SStefano Zampini     break;
3086fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3087fcdce8c4SStefano Zampini     m = A->cmap->n;
3088fcdce8c4SStefano Zampini     n = B->cmap->n;
3089fcdce8c4SStefano Zampini     k = A->rmap->n;
30909566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3091fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3092fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3093fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3094fcdce8c4SStefano Zampini     break;
3095fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3096fcdce8c4SStefano Zampini     m = A->rmap->n;
3097fcdce8c4SStefano Zampini     n = B->rmap->n;
3098fcdce8c4SStefano Zampini     k = A->cmap->n;
30999566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3100fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3101fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3102fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3103fcdce8c4SStefano Zampini     break;
3104d71ae5a4SJacob Faibussowitsch   default:
3105d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3106fcdce8c4SStefano Zampini   }
3107fcdce8c4SStefano Zampini 
3108fcdce8c4SStefano Zampini   /* create cusparse matrix */
31099566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
31109566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3111fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3112fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3113fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3114fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3115fcdce8c4SStefano Zampini 
3116fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3117fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3118fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
31199566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
31209566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3121fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3122fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3123fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3124fcdce8c4SStefano Zampini   } else {
3125fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3126fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3127fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3128fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3129fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3130fcdce8c4SStefano Zampini   }
3131fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3132fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3133fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3134fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3135fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3136fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31379566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31389566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31399566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3140f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3141f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3142f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31439566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31449566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31459566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3146fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3147d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3148fcdce8c4SStefano Zampini     c->nz                = 0;
3149fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3150fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3151fcdce8c4SStefano Zampini     goto finalizesym;
3152fcdce8c4SStefano Zampini   }
3153fcdce8c4SStefano Zampini 
315428b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
315528b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3156fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3157fcdce8c4SStefano Zampini   if (!biscompressed) {
3158fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3159fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3160fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3161fcdce8c4SStefano Zampini #endif
3162fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3163fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3164fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3165fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3166fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3167fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3168fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3169fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3170fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3171fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3172fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31739566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3174fcdce8c4SStefano Zampini     }
3175fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3176fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3177fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3178fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
31799371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31809371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3181fcdce8c4SStefano Zampini     }
3182fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3183fcdce8c4SStefano Zampini #endif
3184fcdce8c4SStefano Zampini   }
318528b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
318628b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3187fcdce8c4SStefano Zampini   /* precompute flops count */
3188fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3189fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3190fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3191fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3192fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3193fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3194fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3195fcdce8c4SStefano Zampini       }
3196fcdce8c4SStefano Zampini     }
3197fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3198fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3199fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3200fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3201fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3202fcdce8c4SStefano Zampini     }
3203fcdce8c4SStefano Zampini   } else { /* TODO */
3204fcdce8c4SStefano Zampini     flops = 0.;
3205fcdce8c4SStefano Zampini   }
3206fcdce8c4SStefano Zampini 
3207fcdce8c4SStefano Zampini   mmdata->flops = flops;
32089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3209b4285af6SJunchao Zhang 
3210fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
32119566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
32121ffab3bdSJunchao Zhang   // cuda-12.2 requires non-null csrRowOffsets
32131ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32149371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32159566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3216b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3217b4285af6SJunchao Zhang   {
3218b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3219b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3220b4285af6SJunchao Zhang   */
3221b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3222b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3223b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3224b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3225b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3226b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3227b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3228b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3229b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3230b4285af6SJunchao Zhang 
3231b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32329371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32339371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32349566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3235b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32369371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32379371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3238b4285af6SJunchao Zhang 
32399371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32409371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32419566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32449371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3248b4285af6SJunchao Zhang 
3249b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32509566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3251b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3252b4285af6SJunchao Zhang     /* allocate matrix C */
32539371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32549371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32559371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32569371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3257b4285af6SJunchao Zhang     /* update matC with the new pointers */
32589371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32599371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3260b4285af6SJunchao Zhang 
32619371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32629371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32639566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32649371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32659371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32669566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32679371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32689371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32699566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3270b4285af6SJunchao Zhang   }
3271ae37ee31SJunchao Zhang   #else
3272b4285af6SJunchao Zhang   size_t bufSize2;
3273fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
32749371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
32759371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32769566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3277fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
32789371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
32799371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3280fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
32819371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
32829371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3283fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3284fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3285fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3286fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3287fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
32889566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3289fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
32909371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
32919371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3292fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
32939566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3294fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
32959371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
32969371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3297fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32989566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3299fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33009566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
33019371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
33029371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33039371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
33049371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3305ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3306fcdce8c4SStefano Zampini #else
33079566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
33089371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33099371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
33109371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3311fcdce8c4SStefano Zampini   c->nz                = cnz;
3312fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33139566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3314fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3316fcdce8c4SStefano Zampini 
33179566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3318fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3319fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3320fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
33219371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33229371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
33239371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3324fcdce8c4SStefano Zampini #endif
33259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33269566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3327fcdce8c4SStefano Zampini finalizesym:
3328fcdce8c4SStefano Zampini   c->free_a = PETSC_TRUE;
33299f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
33309f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3331fcdce8c4SStefano Zampini   c->free_ij = PETSC_TRUE;
33327de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3333fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3334fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3335fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3336fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3337fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3338fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33399566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3341fcdce8c4SStefano Zampini   } else {
3342fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3343fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346fcdce8c4SStefano Zampini   }
3347fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3348fcdce8c4SStefano Zampini     PetscInt r = 0;
3349fcdce8c4SStefano Zampini     c->i[0]    = 0;
3350fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3351fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3352fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3353fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3354fcdce8c4SStefano Zampini     }
3355fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3356fcdce8c4SStefano Zampini   }
33579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3360fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3361fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3362fcdce8c4SStefano Zampini   c->rmax          = 0;
3363fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3364fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3365fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3366fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3367fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3368fcdce8c4SStefano Zampini   }
33699566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
33709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3371fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3372fcdce8c4SStefano Zampini 
3373fcdce8c4SStefano Zampini   C->nonzerostate++;
33749566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
33759566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3376fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3377fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3378fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3379fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3380fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3381abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3382fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3383fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3384fcdce8c4SStefano Zampini   }
3385fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
33863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3387fcdce8c4SStefano Zampini }
3388fcdce8c4SStefano Zampini 
3389fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3390fcdce8c4SStefano Zampini 
3391fcdce8c4SStefano Zampini /* handles sparse or dense B */
/* Choose the symbolic-product implementation for a MATSEQAIJCUSPARSE product.

   Handles sparse or dense B: dense operands route to the CUSPARSE-dense
   kernels (or the CPU SeqAIJ-SeqDense path when A is bound to the CPU),
   fully GPU-resident sparse products route to the CUSPARSE SpGEMM symbolic
   routine, and anything else falls back to the plain SeqAIJ implementation.
   When the GPU backend is eligible, runtime options (e.g.
   -matmatmult_backend_cpu or -mat_product_algorithm_backend_cpu) let the
   user force the CPU code path instead. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU backend when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu  = PETSC_FALSE;
    const char *title   = NULL; /* options-section heading shown by -help */
    const char *optname = NULL; /* database key that forces the CPU backend */
    const char *fname   = NULL; /* manual-page name attached to the option */

    /* select the strings matching the user-facing API entry point (api_user)
       or the generic MatProduct interface; unknown types parse no option */
    switch (product->type) {
    case MATPRODUCT_AB:
      fname   = "MatMatMult";
      title   = product->api_user ? "MatMatMult" : "MatProduct_AB";
      optname = product->api_user ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      fname   = "MatTransposeMatMult";
      title   = product->api_user ? "MatTransposeMatMult" : "MatProduct_AtB";
      optname = product->api_user ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      fname   = "MatPtAP";
      title   = product->api_user ? "MatPtAP" : "MatProduct_PtAP";
      optname = product->api_user ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      fname   = "MatRARt";
      title   = product->api_user ? "MatRARt" : "MatProduct_RARt";
      optname = product->api_user ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      fname   = "MatMatMatMult";
      title   = product->api_user ? "MatMatMatMult" : "MatProduct_ABC";
      optname = product->api_user ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    default:
      break;
    }
    if (optname) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(optname, "Use CPU code", fname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
  } else if (Biscusp && Ciscusp) {
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3508ccdfe979SStefano Zampini 
/* yy = A*xx: thin wrapper over the shared kernel; NULL add-vector, trans = herm = PETSC_FALSE selects op(A) = A */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3515e6e9a74fSStefano Zampini 
/* zz = A*xx + yy: thin wrapper over the shared kernel; trans = herm = PETSC_FALSE selects op(A) = A */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3522e6e9a74fSStefano Zampini 
/* yy = A^H*xx: thin wrapper over the shared kernel; trans = herm = PETSC_TRUE selects op(A) = A^H */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3529e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy: thin wrapper over the shared kernel; trans = herm = PETSC_TRUE selects op(A) = A^H */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
35369ae82921SPaul Mullowney 
/* yy = A^T*xx: thin wrapper over the shared kernel; trans = PETSC_TRUE, herm = PETSC_FALSE selects op(A) = A^T */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3543ca45077fSPaul Mullowney 
/* y[idx[i]] += x[i] for 0 <= i < n: scatter-add a compressed work vector x into the
   full-length output y. One thread per entry; launched as <<<ceil(n/256), 256>>> by
   MatMultAddKernel_SeqAIJCUSPARSE. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Widen before the multiply: blockIdx.x * blockDim.x is evaluated in unsigned int and
     the former `int i` would overflow for n > INT_MAX when PetscInt is 64-bit (the sibling
     kernel GetDiagonal_CSR already computes its index in a wide type for this reason) */
  PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3549a0e72f99SJunchao Zhang 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared implementation behind all the MatMult*/ /*MatMultAdd* wrappers above.
   - yy may be NULL (plain multiply, no add) and may alias zz (in-place add): both cases are branched on below.
   - Handles compressed-row storage (matrices that drop empty rows): the product of the short,
     compressed operand lives in cusparsestruct->workVector and is scattered/gathered to full length.
   - herm without trans is rejected up front (cuSPARSE has no "conjugate, no transpose" op here). */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: the product is zero, so the result is just yy (or zero) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose/Hermitian case: either let cuSPARSE apply op(A) to the untransposed data,
       or (when requested and allowed) multiply with an explicitly stored transpose */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* workVector[i] = x[cprowIndices[i]] via a zip of (destination, permuted source) pairs */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop count: each row costs 2*nnz_row - 1 (nnz multiplies, nnz-1 adds), so a plain multiply
     is 2*nz - (#nonempty rows); the extra add of yy brings it back to 2*nz */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
37239ae82921SPaul Mullowney 
/* zz = A^T*xx + yy: thin wrapper over the shared kernel; trans = PETSC_TRUE, herm = PETSC_FALSE selects op(A) = A^T */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3730ca45077fSPaul Mullowney 
37319ee18893SBarry Smith PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
37329ee18893SBarry Smith 
/* Extract the main diagonal of a CSR matrix: diag[x] = A(x,x), or 0 when row x stores no
   diagonal entry. One thread per row; `row`/`col`/`val` are the CSR row offsets, column
   indices, and values; `len` is the number of rows (and the length of diag). */
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t x = blockIdx.x * blockDim.x + threadIdx.x;

  if (x >= len) return; /* grid may be padded past the last row */
  const PetscInt begin = row[x], end = row[x + 1];
  PetscScalar    d     = 0.0;

  /* scan row x for the column matching the row index; stop at the first hit */
  for (PetscInt j = begin; j < end; j++) {
    if (col[j] == x) {
      d = val[j];
      break;
    }
  }
  diag[x] = d;
}
37509ee18893SBarry Smith 
/* Get the diagonal of A into `diag`. When the up-to-date data is on the GPU, launch a small
   kernel over the device CSR arrays; otherwise fall back to the host SeqAIJ implementation. */
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE           *cusp    = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat;
  PetscScalar                  *diagArray;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_BOTH && A->offloadmask != PETSC_OFFLOAD_GPU) {
    /* current values live on the host only: use the CPU path */
    PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
  } else {
    const PetscInt n   = A->rmap->n;
    CsrMatrix     *csr = (CsrMatrix *)mstruct->mat;

    PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
    if (n > 0) {
      PetscCall(VecCUDAGetArrayWrite(diag, &diagArray));
      GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), n, diagArray);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing sticky state */
      PetscCall(VecCUDARestoreArrayWrite(diag, &diagArray));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
37729ee18893SBarry Smith 
/* Finish matrix assembly: reuses the host SeqAIJ assembly unchanged. NOTE(review): the device
   copy is presumably refreshed lazily by MatSeqAIJCUSPARSECopyToGPU() at first use (as done in
   MatMultAddKernel_SeqAIJCUSPARSE above) — nothing GPU-side happens here. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
37799ae82921SPaul Mullowney 
3780e057df02SPaul Mullowney /*@
378153220ed8SBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
37829ae82921SPaul Mullowney 
3783d083f849SBarry Smith   Collective
37849ae82921SPaul Mullowney 
37859ae82921SPaul Mullowney   Input Parameters:
378611a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
37879ae82921SPaul Mullowney . m    - number of rows
37889ae82921SPaul Mullowney . n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
379020f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37919ae82921SPaul Mullowney 
37929ae82921SPaul Mullowney   Output Parameter:
37939ae82921SPaul Mullowney . A - the matrix
37949ae82921SPaul Mullowney 
37952ef1f0ffSBarry Smith   Level: intermediate
37962ef1f0ffSBarry Smith 
37972ef1f0ffSBarry Smith   Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
37992920cce0SJacob Faibussowitsch   calculations. For good matrix assembly performance the user should preallocate the matrix
38002920cce0SJacob Faibussowitsch   storage by setting the parameter `nz` (or the array `nnz`).
38012920cce0SJacob Faibussowitsch 
380211a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
380411a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
38059ae82921SPaul Mullowney 
380611a5261eSBarry Smith   The AIJ format, also called
38072ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
38089ae82921SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
380920f4b53cSBarry Smith   either one (as in Fortran) or zero.
38109ae82921SPaul Mullowney 
38119ae82921SPaul Mullowney   Specify the preallocated storage with either nz or nnz (not both).
38122ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
381320f4b53cSBarry Smith   allocation.
38149ae82921SPaul Mullowney 
381553220ed8SBarry Smith   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
381653220ed8SBarry Smith 
381753220ed8SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
381853220ed8SBarry Smith           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
38199ae82921SPaul Mullowney @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* preallocation goes through the plain SeqAIJ routine; nnz (if non-NULL) takes precedence over nz */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38299ae82921SPaul Mullowney 
/* Destroy the GPU-side data (plain cuSPARSE structs for an unfactored matrix, triangular-factor
   structs otherwise), remove every composed method installed on this type, then finish with the
   base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of all functions composed onto the object at creation/convert time */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38519ae82921SPaul Mullowney 
3852ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
385395639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by building a host SeqAIJ copy, then converting that copy in place back to AIJCUSPARSE */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38619ff858a8SKarl Rupp 
3862d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3863d71ae5a4SJacob Faibussowitsch {
3864a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3865039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3866039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3867039c6fbaSStefano Zampini   PetscScalar        *ay;
3868039c6fbaSStefano Zampini   const PetscScalar  *ax;
3869039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3870e6e9a74fSStefano Zampini 
387195639643SRichard Tran Mills   PetscFunctionBegin;
3872a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3873a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3874039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
38759566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38769566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
38773ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
387895639643SRichard Tran Mills   }
3879039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
38809566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
38819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
38825f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
38835f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3884039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3885039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3886039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3887039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3888039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3889ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3890039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3891039c6fbaSStefano Zampini   }
3892d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3893d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3894039c6fbaSStefano Zampini 
3895039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3896039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3897039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3898039c6fbaSStefano Zampini     size_t bufferSize;
3899039c6fbaSStefano Zampini     void  *buffer;
3900039c6fbaSStefano Zampini #endif
3901039c6fbaSStefano Zampini 
39029566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
39039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39049566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3905039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
39069371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39079371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
39089566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
39099566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39109371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39119371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
39129566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
39139566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3915039c6fbaSStefano Zampini #else
39169566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39179371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39189371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
39199566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
39209566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3921039c6fbaSStefano Zampini #endif
39229566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
39239566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39249566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39259566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3926039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3927a587d139SMark     cublasHandle_t cublasv2handle;
3928a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3929039c6fbaSStefano Zampini 
39309566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
39319566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39329566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39339566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
39349566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39359566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
39369566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
39379566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39389566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39399566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39409566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3941039c6fbaSStefano Zampini   } else {
39429566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
39439566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3944a587d139SMark   }
39453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
394695639643SRichard Tran Mills }
394795639643SRichard Tran Mills 
3948d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3949d71ae5a4SJacob Faibussowitsch {
395033c9ba73SStefano Zampini   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
395133c9ba73SStefano Zampini   PetscScalar   *ay;
395233c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
395333c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
395433c9ba73SStefano Zampini 
395533c9ba73SStefano Zampini   PetscFunctionBegin;
39569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39579566063dSJacob Faibussowitsch   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39589566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(y->nz, &bnz));
39599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
39609566063dSJacob Faibussowitsch   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
39619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(bnz));
39629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
39639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
39653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
396633c9ba73SStefano Zampini }
396733c9ba73SStefano Zampini 
3968d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3969d71ae5a4SJacob Faibussowitsch {
3970f5d0f301SBarry Smith   PetscBool   gpu = PETSC_FALSE;
3971a587d139SMark   Mat_SeqAIJ *a   = (Mat_SeqAIJ *)A->data;
39727e8381f9SStefano Zampini 
39733fa6b06aSMark Adams   PetscFunctionBegin;
39743fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
39753fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
39767e8381f9SStefano Zampini     if (spptr->mat) {
39777e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
39787e8381f9SStefano Zampini       if (matrix->values) {
3979f5d0f301SBarry Smith         gpu = PETSC_TRUE;
39807e8381f9SStefano Zampini         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39817e8381f9SStefano Zampini       }
39827e8381f9SStefano Zampini     }
39837e8381f9SStefano Zampini     if (spptr->matTranspose) {
39847e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3985ad540459SPierre Jolivet       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39867e8381f9SStefano Zampini     }
39873fa6b06aSMark Adams   }
3988f5d0f301SBarry Smith   if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
3989f5d0f301SBarry Smith   else {
39909566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3991f5d0f301SBarry Smith     A->offloadmask = PETSC_OFFLOAD_CPU;
3992f5d0f301SBarry Smith   }
39939566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
39943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
39953fa6b06aSMark Adams }
39963fa6b06aSMark Adams 
39972c55c4ccSJose E. Roman static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
399803db1824SAlex Lindsay {
399903db1824SAlex Lindsay   PetscFunctionBegin;
400003db1824SAlex Lindsay   *m = PETSC_MEMTYPE_CUDA;
400103db1824SAlex Lindsay   PetscFunctionReturn(PETSC_SUCCESS);
400203db1824SAlex Lindsay }
400303db1824SAlex Lindsay 
4004d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
4005d71ae5a4SJacob Faibussowitsch {
4006a587d139SMark   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4007a587d139SMark 
4008a587d139SMark   PetscFunctionBegin;
40099a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
40109a14fc28SStefano Zampini     A->boundtocpu = flg;
40113ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
40129a14fc28SStefano Zampini   }
4013a587d139SMark   if (flg) {
40149566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
4015a587d139SMark 
401633c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
40179ee18893SBarry Smith     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
4018a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
4019a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
4020a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
4021a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
4022a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
4023a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
4024a587d139SMark     A->ops->multhermitiantranspose    = NULL;
4025a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
4026fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
402703db1824SAlex Lindsay     A->ops->getcurrentmemtype         = NULL;
40289566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
40299566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
40309566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
40319566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
40329566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
40339566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
40349566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4035a587d139SMark   } else {
403633c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
40379ee18893SBarry Smith     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
4038a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4039a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4040a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4041a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4042a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4043a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4044a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4045a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4046fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
404703db1824SAlex Lindsay     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
404867a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
404967a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
405067a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
405167a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
405267a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
405367a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
40547ee59b9bSJunchao Zhang     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
40557ee59b9bSJunchao Zhang 
40569566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
40579566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
40589566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
40599566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
40609566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
40619566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4062a587d139SMark   }
4063a587d139SMark   A->boundtocpu = flg;
40644d12350bSJunchao Zhang   if (flg && a->inode.size_csr) {
4065ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
4066ea500dcfSRichard Tran Mills   } else {
4067ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
4068ea500dcfSRichard Tran Mills   }
40693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4070a587d139SMark }
4071a587d139SMark 
40728eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4073d71ae5a4SJacob Faibussowitsch {
407449735bf3SStefano Zampini   Mat B;
40759ae82921SPaul Mullowney 
40769ae82921SPaul Mullowney   PetscFunctionBegin;
40779566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
407849735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
40799566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
408049735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
40819566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
408249735bf3SStefano Zampini   }
408349735bf3SStefano Zampini   B = *newmat;
408449735bf3SStefano Zampini 
40859566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
40869566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
408734136279SStefano Zampini 
408849735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
40899ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
4090e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
40919566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
40929566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
40939566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
40941a2c6b5cSJunchao Zhang       spptr->format = MAT_CUSPARSE_CSR;
4095d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4096b917901dSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4097a435da06SStefano Zampini       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4098a435da06SStefano Zampini   #else
4099d8132acaSStefano Zampini       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4100a435da06SStefano Zampini   #endif
4101d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4102d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4103d8132acaSStefano Zampini #endif
41041a2c6b5cSJunchao Zhang       B->spptr = spptr;
41059ae82921SPaul Mullowney     } else {
4106e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
4107e6e9a74fSStefano Zampini 
41089566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
41099566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
41109566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4111e6e9a74fSStefano Zampini       B->spptr = spptr;
41129ae82921SPaul Mullowney     }
4113e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
411449735bf3SStefano Zampini   }
4115693b0035SStefano Zampini   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
41169ae82921SPaul Mullowney   B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
41171a2c6b5cSJunchao Zhang   B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
41189ae82921SPaul Mullowney   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
411995639643SRichard Tran Mills   B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
4120693b0035SStefano Zampini   B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
412103db1824SAlex Lindsay   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
41222205254eSKarl Rupp 
41239566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
41249566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
41259566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4126ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
41279566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4128ae48a8d0SStefano Zampini #endif
41299566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
41303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41319ae82921SPaul Mullowney }
41329ae82921SPaul Mullowney 
4133d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4134d71ae5a4SJacob Faibussowitsch {
413502fe1965SBarry Smith   PetscFunctionBegin;
41369566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
41379566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
41383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
413902fe1965SBarry Smith }
414002fe1965SBarry Smith 
41413ca39a21SBarry Smith /*MC
414253220ed8SBarry Smith    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4143e057df02SPaul Mullowney 
4144e057df02SPaul Mullowney    Options Database Keys:
414553220ed8SBarry Smith +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
414653220ed8SBarry Smith .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
41472ef1f0ffSBarry Smith                                            Other options include ell (ellpack) or hyb (hybrid).
414853220ed8SBarry Smith .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
414953220ed8SBarry Smith -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU
4150e057df02SPaul Mullowney 
4151e057df02SPaul Mullowney   Level: beginner
4152e057df02SPaul Mullowney 
415353220ed8SBarry Smith   Notes:
415453220ed8SBarry Smith   These matrices can be in either CSR, ELL, or HYB format.
415553220ed8SBarry Smith 
415653220ed8SBarry Smith   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
415753220ed8SBarry Smith 
415853220ed8SBarry Smith   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
415953220ed8SBarry Smith   if some integer values passed in do not fit in `int`.
416053220ed8SBarry Smith 
41611cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4162e057df02SPaul Mullowney M*/
41637f756511SDominic Meiser 
4164d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4165d71ae5a4SJacob Faibussowitsch {
416642c9c57cSBarry Smith   PetscFunctionBegin;
41679566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
41689566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
41699566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
41709566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
41713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
417242c9c57cSBarry Smith }
417329b38603SBarry Smith 
41742c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4175d71ae5a4SJacob Faibussowitsch {
41762c4ab24aSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4177cbc6b225SStefano Zampini 
4178cbc6b225SStefano Zampini   PetscFunctionBegin;
41792c4ab24aSJunchao Zhang   if (cusp) {
41802c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
41812c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
41822c4ab24aSJunchao Zhang     delete cusp->workVector;
41832c4ab24aSJunchao Zhang     delete cusp->rowoffsets_gpu;
41842c4ab24aSJunchao Zhang     delete cusp->csr2csc_i;
41852c4ab24aSJunchao Zhang     delete cusp->coords;
41862c4ab24aSJunchao Zhang     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
41872c4ab24aSJunchao Zhang     PetscCall(PetscFree(mat->spptr));
41887f756511SDominic Meiser   }
41893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41907f756511SDominic Meiser }
41917f756511SDominic Meiser 
4192d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4193d71ae5a4SJacob Faibussowitsch {
41947f756511SDominic Meiser   PetscFunctionBegin;
41957f756511SDominic Meiser   if (*mat) {
41967f756511SDominic Meiser     delete (*mat)->values;
41977f756511SDominic Meiser     delete (*mat)->column_indices;
41987f756511SDominic Meiser     delete (*mat)->row_offsets;
41997f756511SDominic Meiser     delete *mat;
42007f756511SDominic Meiser     *mat = 0;
42017f756511SDominic Meiser   }
42023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42037f756511SDominic Meiser }
42047f756511SDominic Meiser 
4205b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4206d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4207d71ae5a4SJacob Faibussowitsch {
42087f756511SDominic Meiser   PetscFunctionBegin;
42097f756511SDominic Meiser   if (*trifactor) {
42109566063dSJacob Faibussowitsch     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4211261a78b4SJunchao Zhang     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
42129566063dSJacob Faibussowitsch     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
42139566063dSJacob Faibussowitsch     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
42149566063dSJacob Faibussowitsch     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4215afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
42169566063dSJacob Faibussowitsch     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4217afb2bd1cSJunchao Zhang   #endif
42189566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactor));
42197f756511SDominic Meiser   }
42203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42217f756511SDominic Meiser }
4222d460d7bfSJunchao Zhang #endif
42237f756511SDominic Meiser 
4224d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4225d71ae5a4SJacob Faibussowitsch {
42267f756511SDominic Meiser   CsrMatrix *mat;
42277f756511SDominic Meiser 
42287f756511SDominic Meiser   PetscFunctionBegin;
42297f756511SDominic Meiser   if (*matstruct) {
42307f756511SDominic Meiser     if ((*matstruct)->mat) {
42317f756511SDominic Meiser       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4232afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4233afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4234afb2bd1cSJunchao Zhang #else
42357f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
42369566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4237afb2bd1cSJunchao Zhang #endif
42387f756511SDominic Meiser       } else {
42397f756511SDominic Meiser         mat = (CsrMatrix *)(*matstruct)->mat;
42403ba16761SJacob Faibussowitsch         PetscCall(CsrMatrix_Destroy(&mat));
42417f756511SDominic Meiser       }
42427f756511SDominic Meiser     }
42439566063dSJacob Faibussowitsch     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
42447f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
42459566063dSJacob Faibussowitsch     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
42469566063dSJacob Faibussowitsch     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
42479566063dSJacob Faibussowitsch     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4248afb2bd1cSJunchao Zhang 
4249afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4250afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
42519566063dSJacob Faibussowitsch     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4252fe5544b9SJunchao Zhang 
4253afb2bd1cSJunchao Zhang     for (int i = 0; i < 3; i++) {
4254afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
42559566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
42569566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
42579566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4258fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4259fe5544b9SJunchao Zhang         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4260fe5544b9SJunchao Zhang         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4261fe5544b9SJunchao Zhang   #endif
4262afb2bd1cSJunchao Zhang       }
4263afb2bd1cSJunchao Zhang     }
4264afb2bd1cSJunchao Zhang #endif
42657f756511SDominic Meiser     delete *matstruct;
42667e8381f9SStefano Zampini     *matstruct = NULL;
42677f756511SDominic Meiser   }
42683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42697f756511SDominic Meiser }
42707f756511SDominic Meiser 
4271d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4272d71ae5a4SJacob Faibussowitsch {
4273da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4274da112707SJunchao Zhang 
42757f756511SDominic Meiser   PetscFunctionBegin;
4276da112707SJunchao Zhang   if (fs) {
4277b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4278da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4279da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4280da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4281da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4282d460d7bfSJunchao Zhang     delete fs->workVector;
4283d460d7bfSJunchao Zhang     fs->workVector = NULL;
4284d460d7bfSJunchao Zhang #endif
4285da112707SJunchao Zhang     delete fs->rpermIndices;
4286da112707SJunchao Zhang     delete fs->cpermIndices;
4287da112707SJunchao Zhang     fs->rpermIndices  = NULL;
4288da112707SJunchao Zhang     fs->cpermIndices  = NULL;
4289da112707SJunchao Zhang     fs->init_dev_prop = PETSC_FALSE;
4290b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4291da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4292da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx));
429330807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
429430807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4295da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrVal));
4296d460d7bfSJunchao Zhang     PetscCallCUDA(cudaFree(fs->diag));
4297da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->X));
4298da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->Y));
429912ba2bc6SJunchao Zhang     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4300da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4301da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
430212ba2bc6SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4303da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4304da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4305da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4306da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4307da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4308da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4309da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4310da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4311da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4312da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4313da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4314da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4315d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrRowPtr_h));
4316d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrVal_h));
4317d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->diag_h));
431812ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
431912ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4320da112707SJunchao Zhang #endif
4321ccdfe979SStefano Zampini   }
43223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4323ccdfe979SStefano Zampini }
4324ccdfe979SStefano Zampini 
/* Fully destroy a triangular-factors container: release the device/host
   resources it owns (via MatSeqAIJCUSPARSETriFactors_Reset()), destroy its
   cuSPARSE handle, and free the struct itself. A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
43357e8381f9SStefano Zampini 
/* Strict-weak ordering for (i, j) index pairs: lexicographic, first by the
   tuple's first component and then by its second. Usable on both host and
   device (e.g. with thrust::sort). */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
43447e8381f9SStefano Zampini 
/* Mark the cached device transpose of A as out of date. With destroy ==
   PETSC_TRUE the transpose multiply structure and the cached csr2csc
   permutation are freed outright; otherwise the storage is kept for reuse
   and only the transupdated flag is cleared. No-op if A has no spptr. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4360a49f1ed0SStefano Zampini 
/* PetscContainer destructor for the device-side COO assembly struct: frees the
   two CUDA arrays it owns (jmap and perm) and then the host struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo_d = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo_d->jmap));
  PetscCallCUDA(cudaFree(coo_d->perm));
  PetscCall(PetscFree(coo_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4371ed502f03SStefano Zampini 
// Set the COO (coordinate-format) assembly pattern for a MATSEQAIJCUSPARSE matrix.
// coo_i[]/coo_j[] may reside in host or device memory; device input is first copied
// to host so the host preallocation routine can consume it. After the host-side
// preallocation builds the "__PETSc_MatCOOStruct_Host" struct, its jmap/perm arrays
// are mirrored on the device and the device copy is attached to the matrix under
// "__PETSc_MatCOOStruct_Device" for use by MatSetValuesCOO_SeqAIJCUSPARSE().
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype)); // only coo_i is probed; coo_j is assumed to live in the same memory space
  if (PetscMemTypeDevice(mtype)) {
    // indices were given on the device: stage them back to host for MatSetPreallocationCOO_SeqAIJ()
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; // preallocation happened on the host, so the host copy is now the valid one
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  // jmap has nz+1 entries (offsets), perm has Atot entries (positions of all COO inputs)
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4412219fbbafSJunchao Zhang 
// Device kernel: scatter-add raw COO values kv[] into the CSR value array a[].
// Launched with a 1-D grid; the grid-stride loop makes the result independent
// of the launch configuration. For CSR slot i, perm[jmap[i]..jmap[i+1]) lists
// the positions in kv[] of every COO input that maps to that slot; their sum
// overwrites a[i] when imode == INSERT_VALUES and is accumulated otherwise.
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4423219fbbafSJunchao Zhang 
// Insert/add a batch of COO values (preallocated by
// MatSetPreallocationCOO_SeqAIJCUSPARSE()) into the device CSR value array.
//
// Input Parameters:
// + A     - the matrix
// . v     - the COO values; may live in host or device memory (host input is
//           staged through a temporary device buffer)
// - imode - INSERT_VALUES overwrites existing entries, otherwise values are accumulated
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  // Retrieve the device-side COO mapping attached by MatSetPreallocationCOO_SeqAIJCUSPARSE()
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only access: old values are discarded */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    // Divide in 64-bit (PetscCount) BEFORE narrowing to int: the previous
    // form (int)(Annz + 255) / 256 truncated the sum first and overflowed to
    // a negative block count for Annz near INT_MAX, which is reachable in
    // --with-64-bit-indices builds. The grid-stride loop in MatAddCOOValues
    // keeps the kernel correct for any positive block count.
    MatAddCOOValues<<<(int)((Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); // catch launch-configuration errors without clearing sticky state
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); // release the staging buffer
  PetscFunctionReturn(PETSC_SUCCESS);
}
4463219fbbafSJunchao Zhang 
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* NOTE(review): both output pointers must be requested together; if either is
     NULL this returns early and sets nothing, so the per-pointer guards below
     can never be false -- confirm whether requesting only one was intended */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the CSR structure exists and is current on the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily build and cache the full (uncompressed) row offsets on the device from the host CSR */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
45115f101d05SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: nothing has to be undone on restore */
  /* just invalidate the caller's views of the device index arrays */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
45375f101d05SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); /* only the CSR storage exposes raw values */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* bring the device copy up to date if the host holds newer data */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only: offload mask and cached transpose are deliberately left untouched */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4573ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no diagonal invalidation or object-state bump needed,
     just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4596ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); /* only the CSR storage exposes raw values */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* bring the device copy up to date before handing out a writable pointer */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write through the pointer: the device copy becomes the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values will be stale; keep its storage */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4658039c6fbaSStefano Zampini 
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU() here: the
     device CSR structure must already exist (old values will be overwritten) */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the valid one; caller must fill all values */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values will be stale; keep its storage */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4697ed502f03SStefano Zampini 
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4722ed502f03SStefano Zampini 
/* Strict-weak ordering for (i, j, value, index) 4-tuples: lexicographic on the
   first two integer components only; the scalar value and the trailing index
   ride along and never influence the ordering. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4731ed502f03SStefano Zampini 
/* Unary functor that adds a fixed integer offset to each input value;
   usable on both host and device (e.g. as a thrust transform operator). */
struct Shift {
  int _shift; // the constant offset applied to every element

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4738ed502f03SStefano Zampini 
473921afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4740d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4741d71ae5a4SJacob Faibussowitsch {
4742ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4743ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4744ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4745ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4746ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4747ed502f03SStefano Zampini   cusparseStatus_t              stat;
4748ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4749ed502f03SStefano Zampini 
4750ed502f03SStefano Zampini   PetscFunctionBegin;
4751ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4752ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
47534f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4754ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4755ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47565f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
475708401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4758aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4759aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4760ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4761ed502f03SStefano Zampini     m = A->rmap->n;
4762ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47639566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47649566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47659566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4766ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4767ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4768ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4769ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4770ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4771ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4772ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4773ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4774ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4775ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4776ed502f03SStefano Zampini     Ccusp->nrows            = m;
4777ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4778ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4779ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4780ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47819566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47829566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47839566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4784f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4785f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4786f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
47879566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47889566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47899566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47909566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47919566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
479228b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
479328b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4794ed502f03SStefano Zampini 
4795ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4796ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4797ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4798ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4799ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4800ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4801ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4802ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4803ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
48042c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4805ed502f03SStefano Zampini     if (c->nz) {
48062ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
48072ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
48082ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
48092ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
48102ed87e7eSStefano Zampini 
4811ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4812ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4813ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4814ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
48159566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4816ed502f03SStefano Zampini         }
48172ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
48182ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4819ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4820ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4821ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4822ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
48239566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4824ed502f03SStefano Zampini         }
48252ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
48262ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
48279566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48289371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48299371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48309371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48319371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48322ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
48332ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
48342ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
48358909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4836ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4837ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
48388909a122SStefano Zampini #else
48398909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
48408909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
48418909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
48428909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
48438909a122SStefano Zampini #endif
48442ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
48452ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
48462ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
48472ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48482ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48492ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
48502c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
48512c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
4852ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4853792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48548909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48558909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48568909a122SStefano Zampini #endif
48572ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48582ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48592ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4860792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48612ed87e7eSStefano Zampini #else
486259c3d2bbSPierre Jolivet   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
48632ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
486459c3d2bbSPierre Jolivet   #else
486559c3d2bbSPierre Jolivet       auto pred = cuda::std::identity();
486659c3d2bbSPierre Jolivet   #endif
4867792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4868792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48692ed87e7eSStefano Zampini #endif
48709371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48719371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48729566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48732ed87e7eSStefano Zampini       delete wPerm;
48742ed87e7eSStefano Zampini       delete Acoo;
48752ed87e7eSStefano Zampini       delete Bcoo;
48762ed87e7eSStefano Zampini       delete Ccoo;
4877ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48789371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48799371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4880ed502f03SStefano Zampini #endif
48811a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
48829566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48839566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4884ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4885ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4886ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4887ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4888ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4889ed502f03SStefano Zampini 
48901a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48911a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4892a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4893ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4894ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4895ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4896ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4897ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4898ed502f03SStefano Zampini 
4899ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4900ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4901ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4902ed502f03SStefano Zampini 
49039566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4904ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4905ed502f03SStefano Zampini         if (AT) {
4906ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4907ed502f03SStefano Zampini           thrust::advance(rT, -1);
4908ed502f03SStefano Zampini         }
4909ed502f03SStefano Zampini         if (BT) {
4910ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4911ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4912ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4913ed502f03SStefano Zampini         }
4914ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4915ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4916ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4917ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4918ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4919ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49209566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4921ed502f03SStefano Zampini 
49229566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
49239566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
49249566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4925f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4926f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4927f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
49289566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
49299566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
49309566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4931ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
49329371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
49339371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4934ed502f03SStefano Zampini #endif
4935ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4936ed502f03SStefano Zampini       }
4937ed502f03SStefano Zampini     }
4938ed502f03SStefano Zampini 
4939ed502f03SStefano Zampini     c->free_a = PETSC_TRUE;
49409f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
49419f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4942ed502f03SStefano Zampini     c->free_ij = PETSC_TRUE;
49437de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4944ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4945ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4946ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4947ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
49489566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49499566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4950ed502f03SStefano Zampini     } else {
49519566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49529566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4953ed502f03SStefano Zampini     }
49549566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49559566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49569566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4957ed502f03SStefano Zampini     c->maxnz         = c->nz;
4958ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4959ed502f03SStefano Zampini     c->rmax          = 0;
4960ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4961ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4962ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4963ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4964ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4965ed502f03SStefano Zampini     }
49669566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
49679566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4968ed502f03SStefano Zampini     (*C)->nonzerostate++;
49699566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49709566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4971ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4972ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4973ed502f03SStefano Zampini   } else {
497408401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4975ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4976ed502f03SStefano Zampini     if (c->nz) {
4977ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49782c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4979aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
498008401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49819566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49829566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49835f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49845f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4985ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4986ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4987ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4988aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4989aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4990aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4991aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49922c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
49932c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
4994ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
49959566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
49962c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
49979371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4998ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49999371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
50002c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
5001ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
50029566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
50031a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
50045f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5005ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5006ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5007ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5008ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5009ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
5010ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5011ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
50121a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
5013ed502f03SStefano Zampini       }
50149566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
5015ed502f03SStefano Zampini     }
5016ed502f03SStefano Zampini   }
50179566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5018ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
5019ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
5020ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
50213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5022ed502f03SStefano Zampini }
5023c215019aSStefano Zampini 
5024d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
5025d71ae5a4SJacob Faibussowitsch {
5026c215019aSStefano Zampini   bool               dmem;
5027c215019aSStefano Zampini   const PetscScalar *av;
5028c215019aSStefano Zampini 
5029c215019aSStefano Zampini   PetscFunctionBegin;
5030c215019aSStefano Zampini   dmem = isCudaMem(v);
50319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5032c215019aSStefano Zampini   if (n && idx) {
5033c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
5034c215019aSStefano Zampini     widx.assign(idx, idx + n);
50359566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5036c215019aSStefano Zampini 
5037c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
5038c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
5039c215019aSStefano Zampini     if (dmem) {
5040c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
5041c215019aSStefano Zampini     } else {
5042c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
5043c215019aSStefano Zampini       dv = w->data();
5044c215019aSStefano Zampini     }
5045c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5046c215019aSStefano Zampini 
5047c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5048c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5049c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
505048a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5051c215019aSStefano Zampini     delete w;
5052c215019aSStefano Zampini   } else {
50539566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5054c215019aSStefano Zampini   }
50559566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
50569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
50573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5058c215019aSStefano Zampini }
5059