xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a0e72f99b781a1b11acf6c40ba4a7ecafcc3e699)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16*a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17bc3f50f2SPaul Mullowney 
18e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
20afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
21afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22afb2bd1cSJunchao Zhang 
23afb2bd1cSJunchao Zhang   typedef enum {
24afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
25afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
27afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
28afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
29afb2bd1cSJunchao Zhang 
30afb2bd1cSJunchao Zhang   typedef enum {
31afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
34afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
35afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
43afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
44afb2bd1cSJunchao Zhang 
45afb2bd1cSJunchao Zhang   typedef enum {
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
47afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
48afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
49afb2bd1cSJunchao Zhang   */
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
52afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
53afb2bd1cSJunchao Zhang #endif
549ae82921SPaul Mullowney 
55087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
57087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58087f3262SPaul Mullowney 
596fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
616fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
666fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
674416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
68a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
6933c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
706fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
726fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
736fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
75e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
779ae82921SPaul Mullowney 
787f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
81ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
82470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
847f756511SDominic Meiser 
8557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8657181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
87a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
8857181aedSStefano Zampini 
897e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
907e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
917e8381f9SStefano Zampini 
92c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93c215019aSStefano Zampini 
94b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
95b06137fdSPaul Mullowney {
96b06137fdSPaul Mullowney   cusparseStatus_t   stat;
97b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98b06137fdSPaul Mullowney 
99b06137fdSPaul Mullowney   PetscFunctionBegin;
100d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10257d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
103b06137fdSPaul Mullowney   PetscFunctionReturn(0);
104b06137fdSPaul Mullowney }
105b06137fdSPaul Mullowney 
106b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
107b06137fdSPaul Mullowney {
108b06137fdSPaul Mullowney   cusparseStatus_t   stat;
109b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
110b06137fdSPaul Mullowney 
111b06137fdSPaul Mullowney   PetscFunctionBegin;
112d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1136b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11416a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11557d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11616a2e217SAlejandro Lamas Daviña     }
117b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1186b1cf21dSAlejandro Lamas Daviña   }
11957d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
120b06137fdSPaul Mullowney   PetscFunctionReturn(0);
121b06137fdSPaul Mullowney }
122b06137fdSPaul Mullowney 
123b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
124b06137fdSPaul Mullowney {
125b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1267e8381f9SStefano Zampini   PetscBool          flg;
1277e8381f9SStefano Zampini   PetscErrorCode     ierr;
128ccdfe979SStefano Zampini 
129b06137fdSPaul Mullowney   PetscFunctionBegin;
1307e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1317e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
132ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
133b06137fdSPaul Mullowney   PetscFunctionReturn(0);
134b06137fdSPaul Mullowney }
135b06137fdSPaul Mullowney 
136ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1379ae82921SPaul Mullowney {
1389ae82921SPaul Mullowney   PetscFunctionBegin;
1399ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1409ae82921SPaul Mullowney   PetscFunctionReturn(0);
1419ae82921SPaul Mullowney }
1429ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1559ae82921SPaul Mullowney 
15642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1579ae82921SPaul Mullowney {
1589ae82921SPaul Mullowney   PetscErrorCode ierr;
159bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1609ae82921SPaul Mullowney 
1619ae82921SPaul Mullowney   PetscFunctionBegin;
162bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
163bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1642c7c0729SBarry Smith   (*B)->factortype = ftype;
1652c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1669ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1672205254eSKarl Rupp 
168087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16933d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1709ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1719ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
172087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1759ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
176bc3f50f2SPaul Mullowney 
177fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1783ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1799ae82921SPaul Mullowney   PetscFunctionReturn(0);
1809ae82921SPaul Mullowney }
1819ae82921SPaul Mullowney 
182bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
183ca45077fSPaul Mullowney {
184aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1856e111a19SKarl Rupp 
186ca45077fSPaul Mullowney   PetscFunctionBegin;
187ca45077fSPaul Mullowney   switch (op) {
188e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
189aa372e3fSPaul Mullowney     cusparsestruct->format = format;
190ca45077fSPaul Mullowney     break;
191e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
192aa372e3fSPaul Mullowney     cusparsestruct->format = format;
193ca45077fSPaul Mullowney     break;
194ca45077fSPaul Mullowney   default:
19536d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
196ca45077fSPaul Mullowney   }
197ca45077fSPaul Mullowney   PetscFunctionReturn(0);
198ca45077fSPaul Mullowney }
1999ae82921SPaul Mullowney 
200e057df02SPaul Mullowney /*@
201e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
202e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
203aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
204e057df02SPaul Mullowney    Not Collective
205e057df02SPaul Mullowney 
206e057df02SPaul Mullowney    Input Parameters:
2078468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
20836d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2092692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
210e057df02SPaul Mullowney 
211e057df02SPaul Mullowney    Output Parameter:
212e057df02SPaul Mullowney 
213e057df02SPaul Mullowney    Level: intermediate
214e057df02SPaul Mullowney 
2158468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
216e057df02SPaul Mullowney @*/
217e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
218e057df02SPaul Mullowney {
219e057df02SPaul Mullowney   PetscErrorCode ierr;
2206e111a19SKarl Rupp 
221e057df02SPaul Mullowney   PetscFunctionBegin;
222e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
223e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
224e057df02SPaul Mullowney   PetscFunctionReturn(0);
225e057df02SPaul Mullowney }
226e057df02SPaul Mullowney 
227e6e9a74fSStefano Zampini /*@
228e589036eSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
229e6e9a74fSStefano Zampini 
230e6e9a74fSStefano Zampini    Collective on mat
231e6e9a74fSStefano Zampini 
232e6e9a74fSStefano Zampini    Input Parameters:
233e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
234e6e9a74fSStefano Zampini -  transgen - the boolean flag
235e6e9a74fSStefano Zampini 
236e6e9a74fSStefano Zampini    Level: intermediate
237e6e9a74fSStefano Zampini 
238e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
239e6e9a74fSStefano Zampini @*/
240e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
241e6e9a74fSStefano Zampini {
242e6e9a74fSStefano Zampini   PetscErrorCode ierr;
243e6e9a74fSStefano Zampini   PetscBool      flg;
244e6e9a74fSStefano Zampini 
245e6e9a74fSStefano Zampini   PetscFunctionBegin;
246e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
247e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
248e6e9a74fSStefano Zampini   if (flg) {
249e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
25054da937aSStefano Zampini 
251e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
252e6e9a74fSStefano Zampini     cusp->transgen = transgen;
25354da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
254a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
25554da937aSStefano Zampini     }
256e6e9a74fSStefano Zampini   }
257e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
258e6e9a74fSStefano Zampini }
259e6e9a74fSStefano Zampini 
2604416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2619ae82921SPaul Mullowney {
2629ae82921SPaul Mullowney   PetscErrorCode           ierr;
263e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2649ae82921SPaul Mullowney   PetscBool                flg;
265a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2666e111a19SKarl Rupp 
2679ae82921SPaul Mullowney   PetscFunctionBegin;
268e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2699ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
27054da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
27154da937aSStefano Zampini 
27254da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
273afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
274afb2bd1cSJunchao Zhang 
275e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
276a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
277afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
278afb2bd1cSJunchao Zhang 
2794c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
280a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
281afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
282afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
283afb2bd1cSJunchao Zhang     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
284afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
285afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
286afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
287afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
288afb2bd1cSJunchao Zhang 
289afb2bd1cSJunchao Zhang     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
290afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
291afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
292afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
293afb2bd1cSJunchao Zhang 
294afb2bd1cSJunchao Zhang     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
295afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
296afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
297afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
298afb2bd1cSJunchao Zhang    #endif
2994c87dfd4SPaul Mullowney   }
3000af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
3019ae82921SPaul Mullowney   PetscFunctionReturn(0);
3029ae82921SPaul Mullowney }
3039ae82921SPaul Mullowney 
3046fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3059ae82921SPaul Mullowney {
306da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3079ae82921SPaul Mullowney   PetscErrorCode               ierr;
3089ae82921SPaul Mullowney 
3099ae82921SPaul Mullowney   PetscFunctionBegin;
310da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3119ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3129ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3139ae82921SPaul Mullowney   PetscFunctionReturn(0);
3149ae82921SPaul Mullowney }
3159ae82921SPaul Mullowney 
3166fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3179ae82921SPaul Mullowney {
318da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3199ae82921SPaul Mullowney   PetscErrorCode               ierr;
3209ae82921SPaul Mullowney 
3219ae82921SPaul Mullowney   PetscFunctionBegin;
322da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3239ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3249ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3259ae82921SPaul Mullowney   PetscFunctionReturn(0);
3269ae82921SPaul Mullowney }
3279ae82921SPaul Mullowney 
328087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
329087f3262SPaul Mullowney {
330da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
331087f3262SPaul Mullowney   PetscErrorCode               ierr;
332087f3262SPaul Mullowney 
333087f3262SPaul Mullowney   PetscFunctionBegin;
334da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
335087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
336087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
337087f3262SPaul Mullowney   PetscFunctionReturn(0);
338087f3262SPaul Mullowney }
339087f3262SPaul Mullowney 
340087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
341087f3262SPaul Mullowney {
342da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
343087f3262SPaul Mullowney   PetscErrorCode               ierr;
344087f3262SPaul Mullowney 
345087f3262SPaul Mullowney   PetscFunctionBegin;
346da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
347087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
348087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
349087f3262SPaul Mullowney   PetscFunctionReturn(0);
350087f3262SPaul Mullowney }
351087f3262SPaul Mullowney 
352087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3539ae82921SPaul Mullowney {
3549ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3559ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3569ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
357aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3589ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3599ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3609ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3619ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3629ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
363b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
36457d48284SJunchao Zhang   cudaError_t                       cerr;
3659ae82921SPaul Mullowney 
3669ae82921SPaul Mullowney   PetscFunctionBegin;
367cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
368c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3699ae82921SPaul Mullowney     try {
3709ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3719ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
372da79fbbcSStefano Zampini       if (!loTriFactor) {
3732cbc15d9SMark         PetscScalar                       *AALo;
3742cbc15d9SMark 
3752cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3769ae82921SPaul Mullowney 
3779ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
37857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3809ae82921SPaul Mullowney 
3819ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3829ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3839ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3849ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3859ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3869ae82921SPaul Mullowney         v        = aa;
3879ae82921SPaul Mullowney         vi       = aj;
3889ae82921SPaul Mullowney         offset   = 1;
3899ae82921SPaul Mullowney         rowOffset= 1;
3909ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3919ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
392e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3939ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3949ae82921SPaul Mullowney           rowOffset += nz+1;
3959ae82921SPaul Mullowney 
396580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
397580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3989ae82921SPaul Mullowney 
3999ae82921SPaul Mullowney           offset      += nz;
4009ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4019ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4029ae82921SPaul Mullowney           offset      += 1;
4039ae82921SPaul Mullowney 
4049ae82921SPaul Mullowney           v  += nz;
4059ae82921SPaul Mullowney           vi += nz;
4069ae82921SPaul Mullowney         }
4072205254eSKarl Rupp 
408aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
409da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
410da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
411aa372e3fSPaul Mullowney         /* Create the matrix description */
41257d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
41357d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4141b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
415afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
416afb2bd1cSJunchao Zhang        #else
41757d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
418afb2bd1cSJunchao Zhang        #endif
41957d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
42057d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
421aa372e3fSPaul Mullowney 
422aa372e3fSPaul Mullowney         /* set the operation */
423aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
424aa372e3fSPaul Mullowney 
425aa372e3fSPaul Mullowney         /* set the matrix */
426aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
427aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
428aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
430aa372e3fSPaul Mullowney 
431aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
432aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
433aa372e3fSPaul Mullowney 
434aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
435aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
436aa372e3fSPaul Mullowney 
437aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
438aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
439aa372e3fSPaul Mullowney 
440afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
441da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
442afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4431b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
444afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
445afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
446afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
447afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
448afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
449afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
450afb2bd1cSJunchao Zhang       #endif
451afb2bd1cSJunchao Zhang 
452aa372e3fSPaul Mullowney         /* perform the solve analysis */
453aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
454aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
455aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
456afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4571b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
459afb2bd1cSJunchao Zhang                                #endif
460afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
461da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
462da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
463aa372e3fSPaul Mullowney 
464da79fbbcSStefano Zampini         /* assign the pointer */
465aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4662cbc15d9SMark         loTriFactor->AA_h = AALo;
46757d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
46857d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4694863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
470da79fbbcSStefano Zampini       } else { /* update values only */
4712cbc15d9SMark         if (!loTriFactor->AA_h) {
4722cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4732cbc15d9SMark         }
474da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4752cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
476da79fbbcSStefano Zampini         v        = aa;
477da79fbbcSStefano Zampini         vi       = aj;
478da79fbbcSStefano Zampini         offset   = 1;
479da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
480da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4812cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
482da79fbbcSStefano Zampini           offset      += nz;
4832cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
484da79fbbcSStefano Zampini           offset      += 1;
485da79fbbcSStefano Zampini           v  += nz;
486da79fbbcSStefano Zampini         }
4872cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
488da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
489da79fbbcSStefano Zampini       }
4909ae82921SPaul Mullowney     } catch(char *ex) {
4919ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4929ae82921SPaul Mullowney     }
4939ae82921SPaul Mullowney   }
4949ae82921SPaul Mullowney   PetscFunctionReturn(0);
4959ae82921SPaul Mullowney }
4969ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds, or refreshes the values of, the upper
  triangular ILU factor stored as a CSR matrix in cusparseTriFactors->upTriFactorPtr.

  Input Parameter:
. A - matrix whose data is read as Mat_SeqAIJ and whose spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  Rows are walked from last to first through a->diag. Row i is read from
  aa[adiag[i+1]+1 .. adiag[i]], with aa[adiag[i]] being the last entry read. In the packed
  CSR row the first slot is column i with value 1/aa[adiag[i]] (presumably this undoes an
  inverted diagonal kept by the CPU factorization - confirm against the SeqAIJ factor
  layout); the remaining entries are copied unchanged behind it.

  When no factor struct exists yet, the full CSR structure is assembled in host buffers
  obtained from cudaMallocHost(), copied into thrust arrays, and the cuSPARSE descriptor
  and triangular solve analysis are created. The value buffer is kept in AA_h for later
  calls; the index buffers are released. When the factor struct already exists only the
  values are recomputed into AA_h and assigned to the existing values array.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj                 = a->j;
  const PetscInt                    *adiag              = a->diag;
  const MatScalar                   *aa                 = a->a;
  PetscInt                          row,nzUpper;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* act only when the offload mask is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the upper triangular matrix, diagonal included */
    nzUpper = adiag[0] - adiag[n];

    if (upTriFactor) {
      /* the factor already exists: recompute the values only */
      PetscInt pos = nzUpper;

      if (!upTriFactor->AA_h) {
        cerr = cudaMallocHost((void**)&upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      for (row = n-1; row >= 0; --row) {
        const MatScalar *rowVals = aa + adiag[row+1] + 1;
        /* number of elements NOT on the diagonal */
        const PetscInt  nOff     = adiag[row] - adiag[row+1] - 1;

        /* step back to the start of this row in the packed array */
        pos -= (nOff+1);

        /* diagonal slot first, then the off-diagonal values */
        upTriFactor->AA_h[pos] = 1./rowVals[nOff];
        ierr = PetscArraycpy(upTriFactor->AA_h + pos + 1, rowVals, nOff);CHKERRQ(ierr);
      }
      upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
      ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* first call: build structure and values, then run the solve analysis */
      PetscScalar *hostVals;
      PetscInt    *hostRowPtr,*hostColIdx;
      PetscInt    pos;
      CsrMatrix   *csr;

      /* host staging buffers for the CSR arrays */
      cerr = cudaMallocHost((void**)&hostVals, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**)&hostRowPtr, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**)&hostColIdx, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* fill the upper triangular matrix, last row first */
      hostRowPtr[0] = (PetscInt)0;
      hostRowPtr[n] = nzUpper;
      pos           = nzUpper;
      for (row = n-1; row >= 0; --row) {
        const MatScalar *rowVals = aa + adiag[row+1] + 1;
        const PetscInt  *rowCols = aj + adiag[row+1] + 1;
        /* number of elements NOT on the diagonal */
        const PetscInt  nOff     = adiag[row] - adiag[row+1] - 1;

        /* step back to the start of this row in the packed arrays */
        pos -= (nOff+1);

        /* diagonal slot first */
        hostColIdx[pos] = (PetscInt)row;
        hostVals[pos]   = (MatScalar)1./rowVals[nOff];
        hostRowPtr[row] = hostRowPtr[row+1] - (nOff+1);

        /* then the off-diagonal columns and values */
        ierr = PetscArraycpy(hostColIdx + pos + 1, rowCols, nOff);CHKERRQ(ierr);
        ierr = PetscArraycpy(hostVals + pos + 1, rowVals, nOff);CHKERRQ(ierr);
      }

      /* allocate the triangular factor record */
      ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
      upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero-based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* solve with the matrix as stored (no transpose) */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR container; the thrust arrays are filled from the host buffers */
      csr                 = new CsrMatrix;
      upTriFactor->csrMat = csr;
      csr->num_rows       = n;
      csr->num_cols       = n;
      csr->num_entries    = nzUpper;

      csr->row_offsets = new THRUSTINTARRAY32(n+1);
      csr->row_offsets->assign(hostRowPtr, hostRowPtr+n+1);

      csr->column_indices = new THRUSTINTARRAY32(nzUpper);
      csr->column_indices->assign(hostColIdx, hostColIdx+nzUpper);

      csr->values = new THRUSTARRAY(nzUpper);
      csr->values->assign(hostVals, hostVals+nzUpper);

      /* solve analysis; with CUDA >= 9 a work buffer is sized and allocated first */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     csr->num_rows, csr->num_entries, upTriFactor->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               csr->num_rows, csr->num_entries, upTriFactor->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upTriFactor->solveInfo,
                               upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
     #else
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               csr->num_rows, csr->num_entries, upTriFactor->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
     #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; keep the value buffer for later updates, drop the index buffers */
      cusparseTriFactors->upTriFactorPtr = upTriFactor;
      upTriFactor->AA_h                  = hostVals;
      cerr = cudaFreeHost(hostRowPtr);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(hostColIdx);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6409ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds the lower and upper ILU triangular
  factors for the GPU and uploads the row/column permutations when they are needed.

  Input Parameter:
. A - matrix whose data is read as Mat_SeqAIJ and whose spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Errors with PETSC_ERR_COR if A->spptr is NULL.

  After both factors are built, a work vector of length n is created if none exists, nnz is
  set from a->nz, and A->offloadmask becomes PETSC_OFFLOAD_BOTH.

  The indices of a->row (resp. a->icol) are copied into rpermIndices (resp. cpermIndices)
  only when that index set is not the identity and the array has not been created yet.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row;
  IS                           iscol               = a->icol;
  PetscInt                     n                   = A->rmap->n;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* both builders test A->offloadmask, so they run before the mask is changed below */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) {
    cusparseTriFactors->workVector = new THRUSTARRAY(n);
  }
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copied once, and only if it is not the identity */
  ierr = ISIdentity(isrow,&rowIsIdentity);CHKERRQ(ierr);
  if (!(rowIsIdentity || cusparseTriFactors->rpermIndices)) {
    const PetscInt *rowPerm;

    ierr = ISGetIndices(isrow,&rowPerm);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rowPerm, rowPerm+n);
    ierr = ISRestoreIndices(isrow,&rowPerm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(iscol,&colIsIdentity);CHKERRQ(ierr);
  if (!(colIsIdentity || cusparseTriFactors->cpermIndices)) {
    const PetscInt *colPerm;

    ierr = ISGetIndices(iscol,&colPerm);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(colPerm, colPerm+n);
    ierr = ISRestoreIndices(iscol,&colPerm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6849ae82921SPaul Mullowney 
685087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
686087f3262SPaul Mullowney {
687087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
688087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
689aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
690aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
691087f3262SPaul Mullowney   cusparseStatus_t                  stat;
692087f3262SPaul Mullowney   PetscErrorCode                    ierr;
69357d48284SJunchao Zhang   cudaError_t                       cerr;
694087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
695087f3262SPaul Mullowney   PetscScalar                       *AAUp;
696087f3262SPaul Mullowney   PetscScalar                       *AALo;
697087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
698087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
699087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
700087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
701087f3262SPaul Mullowney 
702087f3262SPaul Mullowney   PetscFunctionBegin;
703cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
704c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
705087f3262SPaul Mullowney     try {
706da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
707da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
708da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
709087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
71057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
71157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
712087f3262SPaul Mullowney 
713087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
714087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
715087f3262SPaul Mullowney         AiUp[n]=nzUpper;
716087f3262SPaul Mullowney         offset = 0;
717087f3262SPaul Mullowney         for (i=0; i<n; i++) {
718087f3262SPaul Mullowney           /* set the pointers */
719087f3262SPaul Mullowney           v  = aa + ai[i];
720087f3262SPaul Mullowney           vj = aj + ai[i];
721087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
722087f3262SPaul Mullowney 
723087f3262SPaul Mullowney           /* first, set the diagonal elements */
724087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
72509f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
726087f3262SPaul Mullowney           AiUp[i]      = offset;
72709f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
728087f3262SPaul Mullowney 
729087f3262SPaul Mullowney           offset+=1;
730087f3262SPaul Mullowney           if (nz>0) {
731f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
732580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
733087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
734087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
735087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
736087f3262SPaul Mullowney             }
737087f3262SPaul Mullowney             offset+=nz;
738087f3262SPaul Mullowney           }
739087f3262SPaul Mullowney         }
740087f3262SPaul Mullowney 
741aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
742da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
743da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
744087f3262SPaul Mullowney 
745aa372e3fSPaul Mullowney         /* Create the matrix description */
74657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
74757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7481b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
749afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
750afb2bd1cSJunchao Zhang        #else
75157d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
752afb2bd1cSJunchao Zhang        #endif
75357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
75457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
755087f3262SPaul Mullowney 
756aa372e3fSPaul Mullowney         /* set the matrix */
757aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
758aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
759aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
760aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
761aa372e3fSPaul Mullowney 
762aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
763aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
764aa372e3fSPaul Mullowney 
765aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
766aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
767aa372e3fSPaul Mullowney 
768aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
769aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
770aa372e3fSPaul Mullowney 
771afb2bd1cSJunchao Zhang         /* set the operation */
772afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
773afb2bd1cSJunchao Zhang 
774afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
775da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
776afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7771b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
778afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
779afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
780afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
781afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
782afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
783afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
784afb2bd1cSJunchao Zhang       #endif
785afb2bd1cSJunchao Zhang 
786aa372e3fSPaul Mullowney         /* perform the solve analysis */
787aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
788aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
789aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
790afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7911b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
793afb2bd1cSJunchao Zhang                                 #endif
794afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
795da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
796da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
797aa372e3fSPaul Mullowney 
798da79fbbcSStefano Zampini         /* assign the pointer */
799aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
800aa372e3fSPaul Mullowney 
801aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
802da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
803da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
804aa372e3fSPaul Mullowney 
805aa372e3fSPaul Mullowney         /* Create the matrix description */
80657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
80757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8081b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
809afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
810afb2bd1cSJunchao Zhang        #else
81157d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
812afb2bd1cSJunchao Zhang        #endif
81357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
81457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
815aa372e3fSPaul Mullowney 
816aa372e3fSPaul Mullowney         /* set the operation */
817aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
818aa372e3fSPaul Mullowney 
819aa372e3fSPaul Mullowney         /* set the matrix */
820aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
821aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
822aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
823aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
824aa372e3fSPaul Mullowney 
825aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
826aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
827aa372e3fSPaul Mullowney 
828aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
829aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
830aa372e3fSPaul Mullowney 
831aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
832aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
833aa372e3fSPaul Mullowney 
834afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
835da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
836afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8371b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
838afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
839afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
840afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
841afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
842afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
843afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
844afb2bd1cSJunchao Zhang       #endif
845afb2bd1cSJunchao Zhang 
846aa372e3fSPaul Mullowney         /* perform the solve analysis */
847aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
848aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
849aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
850afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8511b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
853afb2bd1cSJunchao Zhang                                 #endif
854afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
855da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
856da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
857aa372e3fSPaul Mullowney 
858da79fbbcSStefano Zampini         /* assign the pointer */
859aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
860087f3262SPaul Mullowney 
861da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
86257d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
86357d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
864da79fbbcSStefano Zampini       } else {
865da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
866da79fbbcSStefano Zampini         offset = 0;
867da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
868da79fbbcSStefano Zampini           /* set the pointers */
869da79fbbcSStefano Zampini           v  = aa + ai[i];
870da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
871da79fbbcSStefano Zampini 
872da79fbbcSStefano Zampini           /* first, set the diagonal elements */
873da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
874da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
875da79fbbcSStefano Zampini 
876da79fbbcSStefano Zampini           offset+=1;
877da79fbbcSStefano Zampini           if (nz>0) {
878da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
879da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
880da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
881da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
882da79fbbcSStefano Zampini             }
883da79fbbcSStefano Zampini             offset+=nz;
884da79fbbcSStefano Zampini           }
885da79fbbcSStefano Zampini         }
886da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
887da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
888da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
889da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
890da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
891da79fbbcSStefano Zampini       }
89257d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
89357d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
894087f3262SPaul Mullowney     } catch(char *ex) {
895087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
896087f3262SPaul Mullowney     }
897087f3262SPaul Mullowney   }
898087f3262SPaul Mullowney   PetscFunctionReturn(0);
899087f3262SPaul Mullowney }
900087f3262SPaul Mullowney 
901087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
9029ae82921SPaul Mullowney {
9039ae82921SPaul Mullowney   PetscErrorCode               ierr;
904087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
905087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
906087f3262SPaul Mullowney   IS                           ip = a->row;
907087f3262SPaul Mullowney   PetscBool                    perm_identity;
908087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
909087f3262SPaul Mullowney 
910087f3262SPaul Mullowney   PetscFunctionBegin;
911da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
912087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
913da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
914aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
915aa372e3fSPaul Mullowney 
916da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
917da79fbbcSStefano Zampini 
918087f3262SPaul Mullowney   /* lower triangular indices */
919087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
920087f3262SPaul Mullowney   if (!perm_identity) {
9214e4bbfaaSStefano Zampini     IS             iip;
922da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9234e4bbfaaSStefano Zampini 
9244e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
9254e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
926da79fbbcSStefano Zampini     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
927aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
928aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
929aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9304e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9314e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
9324e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
933087f3262SPaul Mullowney     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
934da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
935da79fbbcSStefano Zampini   }
936087f3262SPaul Mullowney   PetscFunctionReturn(0);
937087f3262SPaul Mullowney }
938087f3262SPaul Mullowney 
9396fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
9409ae82921SPaul Mullowney {
9419ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
9429ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
9439ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
944b175d8bbSPaul Mullowney   PetscErrorCode ierr;
9459ae82921SPaul Mullowney 
9469ae82921SPaul Mullowney   PetscFunctionBegin;
94757181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
9489ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
949ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
950e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9519ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
9529ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
953bda325fcSPaul Mullowney   if (row_identity && col_identity) {
954bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
955bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9564e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9574e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
958bda325fcSPaul Mullowney   } else {
959bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
960bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9614e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9624e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
963bda325fcSPaul Mullowney   }
9648dc1d2a3SPaul Mullowney 
965e057df02SPaul Mullowney   /* get the triangular factors */
966087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
9679ae82921SPaul Mullowney   PetscFunctionReturn(0);
9689ae82921SPaul Mullowney }
9699ae82921SPaul Mullowney 
970087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
971087f3262SPaul Mullowney {
972087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
973087f3262SPaul Mullowney   IS             ip = b->row;
974087f3262SPaul Mullowney   PetscBool      perm_identity;
975b175d8bbSPaul Mullowney   PetscErrorCode ierr;
976087f3262SPaul Mullowney 
977087f3262SPaul Mullowney   PetscFunctionBegin;
97857181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
979087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
980ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
981087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
982087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
983087f3262SPaul Mullowney   if (perm_identity) {
984087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
985087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9864e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9874e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
988087f3262SPaul Mullowney   } else {
989087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
990087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9914e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9924e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
993087f3262SPaul Mullowney   }
994087f3262SPaul Mullowney 
995087f3262SPaul Mullowney   /* get the triangular factors */
996087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
997087f3262SPaul Mullowney   PetscFunctionReturn(0);
998087f3262SPaul Mullowney }
9999ae82921SPaul Mullowney 
1000b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1001bda325fcSPaul Mullowney {
1002bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1003aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1004aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1005da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1006da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1007bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1008aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
1009aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
1010aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
1011aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
10121b0a6780SStefano Zampini   cudaError_t                       cerr;
1013da79fbbcSStefano Zampini   PetscErrorCode                    ierr;
1014b175d8bbSPaul Mullowney 
1015bda325fcSPaul Mullowney   PetscFunctionBegin;
1016aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
1017da79fbbcSStefano Zampini   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1018da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1019aa372e3fSPaul Mullowney 
1020aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1021aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1022aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1023aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1024aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1025aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1026aa372e3fSPaul Mullowney 
1027aa372e3fSPaul Mullowney   /* Create the matrix description */
102857d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
102957d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
103057d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
103157d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
103257d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1033aa372e3fSPaul Mullowney 
1034aa372e3fSPaul Mullowney   /* set the operation */
1035aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1036aa372e3fSPaul Mullowney 
1037aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1038aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1039afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1040afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1041aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1042afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1043afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1044afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1045aa372e3fSPaul Mullowney 
1046aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1047afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1048afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1049afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1050afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
1051afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
1052afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
1053afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
1054afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1055afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1056afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
10571b0a6780SStefano Zampini   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1058afb2bd1cSJunchao Zhang #endif
1059afb2bd1cSJunchao Zhang 
1060da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1061aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1062aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1063aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
1064aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
1065aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
1066aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
1067afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1068afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1069afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1070afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1071afb2bd1cSJunchao Zhang                         #else
1072afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1073afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1074afb2bd1cSJunchao Zhang                         #endif
1075afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1076da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1077da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1078aa372e3fSPaul Mullowney 
1079afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1080da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1081afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
10821b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1083afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1084afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1085afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1086afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1087afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1088afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1089afb2bd1cSJunchao Zhang #endif
1090afb2bd1cSJunchao Zhang 
1091afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1092aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1093afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1094afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1095afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
10961b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1097afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1098afb2bd1cSJunchao Zhang                           #endif
1099afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1100da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1101da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1102aa372e3fSPaul Mullowney 
1103da79fbbcSStefano Zampini   /* assign the pointer */
1104aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1105aa372e3fSPaul Mullowney 
1106aa372e3fSPaul Mullowney   /*********************************************/
1107aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1108aa372e3fSPaul Mullowney   /*********************************************/
1109aa372e3fSPaul Mullowney 
1110aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1111da79fbbcSStefano Zampini   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1112da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1113aa372e3fSPaul Mullowney 
1114aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1115aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1116aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1117aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1118aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1119aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1120aa372e3fSPaul Mullowney 
1121aa372e3fSPaul Mullowney   /* Create the matrix description */
112257d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
112357d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
112457d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
112557d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
112657d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1127aa372e3fSPaul Mullowney 
1128aa372e3fSPaul Mullowney   /* set the operation */
1129aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1130aa372e3fSPaul Mullowney 
1131aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1132aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1133afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1134afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1135aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1136afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1137afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1138afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1139aa372e3fSPaul Mullowney 
1140aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1141afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1142afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1143afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1144afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1145afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1146afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1147afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1148afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1149afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1150afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1151afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1152afb2bd1cSJunchao Zhang #endif
1153afb2bd1cSJunchao Zhang 
1154da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1155aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1156aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1157aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1158aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1159aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1160aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1161afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1162afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1163afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1164afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1165afb2bd1cSJunchao Zhang                         #else
1166afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1167afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1168afb2bd1cSJunchao Zhang                         #endif
1169afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1170da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1171da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1172aa372e3fSPaul Mullowney 
1173afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1174da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1175afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
11761b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1177afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1178afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1179afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1180afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1181afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1182afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1183afb2bd1cSJunchao Zhang   #endif
1184afb2bd1cSJunchao Zhang 
1185afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1186aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1187afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1188afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1189afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
11901b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1191afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1192afb2bd1cSJunchao Zhang                           #endif
1193afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1194da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1195da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1196aa372e3fSPaul Mullowney 
1197da79fbbcSStefano Zampini   /* assign the pointer */
1198aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1199bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1200bda325fcSPaul Mullowney }
1201bda325fcSPaul Mullowney 
1202a49f1ed0SStefano Zampini struct PetscScalarToPetscInt
1203a49f1ed0SStefano Zampini {
1204a49f1ed0SStefano Zampini   __host__ __device__
1205a49f1ed0SStefano Zampini   PetscInt operator()(PetscScalar s)
1206a49f1ed0SStefano Zampini   {
1207a49f1ed0SStefano Zampini     return (PetscInt)PetscRealPart(s);
1208a49f1ed0SStefano Zampini   }
1209a49f1ed0SStefano Zampini };
1210a49f1ed0SStefano Zampini 
1211b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1212bda325fcSPaul Mullowney {
1213aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1214a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1215bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1216bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1217aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1218b06137fdSPaul Mullowney   cudaError_t                  err;
121985ba7357SStefano Zampini   PetscErrorCode               ierr;
1220b175d8bbSPaul Mullowney 
1221bda325fcSPaul Mullowney   PetscFunctionBegin;
1222a49f1ed0SStefano Zampini   if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1223a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1224a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1225a49f1ed0SStefano Zampini   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1226a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1227a49f1ed0SStefano Zampini   if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1228a49f1ed0SStefano Zampini   if (cusparsestruct->transupdated) PetscFunctionReturn(0);
122985ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1230a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1231a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1232a49f1ed0SStefano Zampini   }
1233a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1234aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
123557d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1236aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
123757d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
123857d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1239aa372e3fSPaul Mullowney 
1240b06137fdSPaul Mullowney     /* set alpha and beta */
1241afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12427656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12437656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1244afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12457656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12467656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1247b06137fdSPaul Mullowney 
1248aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1249aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1250a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1251554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1252554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1253aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1254a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1255aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1256aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1257a3fdcf43SKarl Rupp 
1258039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
125981902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1260afb2bd1cSJunchao Zhang 
1261afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1262afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&matstructT->matDescr,
1263afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1264afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1265afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1266afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1267afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1268afb2bd1cSJunchao Zhang      #endif
1269aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1270afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1271afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1272afb2bd1cSJunchao Zhang    #else
1273aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
127451c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
127551c6d536SStefano Zampini       /* First convert HYB to CSR */
1276aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1277aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1278aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1279aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1280aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1281aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1282aa372e3fSPaul Mullowney 
1283aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1284aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1285aa372e3fSPaul Mullowney                               temp->values->data().get(),
1286aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
128757d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1288aa372e3fSPaul Mullowney 
1289aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1290aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1291aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1292aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1293aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1294aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1295aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1296aa372e3fSPaul Mullowney 
1297aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1298aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1299aa372e3fSPaul Mullowney                               temp->values->data().get(),
1300aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1301aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1302aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1303aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1304aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
130557d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1306aa372e3fSPaul Mullowney 
1307aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1308aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
130957d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1310aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1311aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1312aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1313aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1314aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1315aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
131657d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1317aa372e3fSPaul Mullowney 
1318aa372e3fSPaul Mullowney       /* assign the pointer */
1319aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
1320a49f1ed0SStefano Zampini       cusparsestruct->transupdated = PETSC_TRUE;
1321aa372e3fSPaul Mullowney       /* delete temporaries */
1322aa372e3fSPaul Mullowney       if (tempT) {
1323aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1324aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1325aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1326aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1327087f3262SPaul Mullowney       }
1328aa372e3fSPaul Mullowney       if (temp) {
1329aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1330aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1331aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1332aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1333aa372e3fSPaul Mullowney       }
1334afb2bd1cSJunchao Zhang      #endif
1335aa372e3fSPaul Mullowney     }
1336a49f1ed0SStefano Zampini   }
1337a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1338a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1339a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1340a49f1ed0SStefano Zampini     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1341a49f1ed0SStefano Zampini     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1342a49f1ed0SStefano Zampini     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1343a49f1ed0SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1344a49f1ed0SStefano Zampini     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1345a49f1ed0SStefano Zampini     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1346a49f1ed0SStefano Zampini     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1347a49f1ed0SStefano Zampini     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1348a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1349a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1350a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1351a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1352a49f1ed0SStefano Zampini     }
1353a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1354a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1355a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1356a49f1ed0SStefano Zampini 
1357a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1358a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1359a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1360a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1361a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1362a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1363a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1364a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1365a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1366a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1367a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1368a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1369a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1370a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1371a49f1ed0SStefano Zampini      #endif
1372a49f1ed0SStefano Zampini 
1373a49f1ed0SStefano Zampini       stat = cusparse_csr2csc(cusparsestruct->handle,
1374a49f1ed0SStefano Zampini                               A->rmap->n,A->cmap->n,matrix->num_entries,
1375a49f1ed0SStefano Zampini                               csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(),
1376a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1377a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1378a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1379a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
1380a49f1ed0SStefano Zampini                               cusparsestruct->csr2cscAlg, csr2cscBuffer
1381a49f1ed0SStefano Zampini                              #else
1382a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1383a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC, indexBase
1384a49f1ed0SStefano Zampini                              #endif
1385a49f1ed0SStefano Zampini );CHKERRCUSPARSE(stat);
1386a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1387a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1388a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1389a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1390a49f1ed0SStefano Zampini      #endif
1391a49f1ed0SStefano Zampini     }
1392a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1393a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1394a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1395a49f1ed0SStefano Zampini   }
139685ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1397213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1398213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1399aa372e3fSPaul Mullowney   /* assign the pointer */
1400aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1401a49f1ed0SStefano Zampini   cusparsestruct->transupdated = PETSC_TRUE;
1402bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1403bda325fcSPaul Mullowney }
1404bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transposed triangular solves on the GPU, with row/column permutations applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the vector receiving the result

  Notes:
  The transposed triangular factors are built lazily through MatSeqAIJCUSPARSEAnalyzeTransposeForSolve()
  when neither of them is present yet.

  Sequence performed on the device:
    1. xx   <- bb gathered through rpermIndices
    2. work <- solve with the transposed upper factor applied to xx
    3. xx   <- solve with the transposed lower factor applied to work
    4. work <- xx gathered through cpermIndices (cannot be done in place)
    5. xx   <- work
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Both factors are dereferenced unconditionally below: fail cleanly instead of crashing */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing transposed triangular factors");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation.
     The copied range is delimited by the index iterators (rpermIndices), so the same
     element iterator is used for both ends of the permutation range. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1491bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transposed triangular solves on the GPU without any permutation step.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the vector receiving the result

  Notes:
  The transposed triangular factors are built lazily through MatSeqAIJCUSPARSEAnalyzeTransposeForSolve()
  when neither of them is present yet.

  Sequence performed on the device:
    1. work <- solve with the transposed upper factor applied to bb
    2. xx   <- solve with the transposed lower factor applied to work
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Both factors are dereferenced unconditionally below: fail cleanly instead of crashing */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing transposed triangular factors");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads barray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1559bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Triangular solves on the GPU with the lower and upper factors, with row/column permutations applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the vector receiving the result

  Notes:
  Sequence performed on the device:
    1. work <- bb gathered through rpermIndices
    2. xx   <- solve with the lower factor applied to work
    3. work <- solve with the upper factor applied to xx
    4. xx   <- work gathered through cpermIndices
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation (guarded like the other thrust calls in this file) */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    tempGPU->begin()));

  /* Next, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
                                    xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16349ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Solves with the stored triangular factors
  without applying any row/column permutation.

  Input Parameters:
+ A  - factored matrix; A->spptr is read as Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side

  Output Parameter:
. xx - solution

  The lower factor is applied to bb, writing into the factors' work vector; the upper
  factor is then applied to the work vector, writing into xx. The routine waits for the
  device before stopping the GPU timer and logs 2*nnz - (local column count) flops.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Device pointers: the solution is write-only, the right-hand side read-only */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Stage 1: lower triangular solve, rhs -> work */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        rhs, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Stage 2: upper triangular solve, work -> sol */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), sol
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Hand the arrays back, then wait for the device so the timer covers the whole solve */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16949ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Brings the host copy of the nonzero values up to date.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSE

  Notes:
  Nothing happens unless A->offloadmask is PETSC_OFFLOAD_GPU. In that case the a->nz values
  stored in the device CSR matrix are copied into a->a with cudaMemcpy() and the offload mask
  becomes PETSC_OFFLOAD_BOTH. Only values are transferred, never row offsets or column indices.

  The device matrix is a CsrMatrix only for MAT_CUSPARSE_CSR; for the ELL/HYB formats
  cusp->mat->mat holds a cusparseHybMat_t. The routine therefore raises an error, before any
  log event is opened, instead of reinterpreting that pointer. It likewise errors if the
  device structure or the host value array is missing.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* Validate everything that is dereferenced below; same guards as the values-only path of CopyToGPU */
    if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Copy from GPU is only supported for the MAT_CUSPARSE_CSR format");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values on the GPU");
    if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* Host and device now hold the same values */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17157e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host array of nonzero values.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host value array (a->a of the Mat_SeqAIJ data)

  The host values are refreshed from the GPU first if needed. The offload mask is then set
  to PETSC_OFFLOAD_CPU, i.e. the host copy is treated as the only valid one from here on.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
17277e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of A current.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSE

  Notes:
  Errors with PETSC_ERR_PLIB if A is bound to the CPU. Otherwise work is done only when
  A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Two paths exist:
  - values only: taken when the nonzero state recorded in the cusparse struct equals
    A->nonzerostate and the format is MAT_CUSPARSE_CSR; a->a is assigned into the existing
    device value array.
  - full rebuild: the existing mult struct, work vector and GPU row offsets are destroyed and
    a new Mat_SeqAIJCUSPARSEMultStruct is built (CSR, or ELL/HYB converted from a temporary
    CSR when CUDA < 11), honouring compressed-row storage if a->compressedrow.use is set.

  The offload mask is set to PETSC_OFFLOAD_BOTH at the end unless the host has no value
  array (a->a is NULL), in which case only the structure was sent.
*/
17286fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
17299ae82921SPaul Mullowney {
1730aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
17317c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
17329ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
   /* m, ii and ridx are (re)assigned below depending on compressed-row use; tmp only feeds the transfer log */
1733213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
17349ae82921SPaul Mullowney   PetscErrorCode               ierr;
1735aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
   /* cleared when the host has no values, so the mask is not promoted to PETSC_OFFLOAD_BOTH */
1736abb89eb1SStefano Zampini   PetscBool                    both = PETSC_TRUE;
1737b06137fdSPaul Mullowney   cudaError_t                  err;
17389ae82921SPaul Mullowney 
17399ae82921SPaul Mullowney   PetscFunctionBegin;
1740fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1741c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1742a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1743a49f1ed0SStefano Zampini       CsrMatrix *matrix;
1744afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
174585ba7357SStefano Zampini 
1746abb89eb1SStefano Zampini       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
174785ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
       /* host-to-device copy of the a->nz values into the existing device array */
1748afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
174905035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
17504863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
175185ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
       /* NOTE(review): the meaning of the second argument is not visible here; presumably PETSC_FALSE
          keeps the transpose structure and only marks its values stale -- confirm against the helper */
1752a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
175334d6c7a5SJose E. Roman     } else {
       /* Full rebuild: drop every device object derived from the old nonzero structure first */
1754abb89eb1SStefano Zampini       PetscInt nnz;
175585ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
17567c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1757a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
17587c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
175981902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
1760a49f1ed0SStefano Zampini       cusparsestruct->workVector = NULL;
1761a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
17629ae82921SPaul Mullowney       try {
         /* Pick the row description to upload: the compressed-row arrays when in use, the plain CSR row pointers otherwise */
17639ae82921SPaul Mullowney         if (a->compressedrow.use) {
17649ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
17659ae82921SPaul Mullowney           ii   = a->compressedrow.i;
17669ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
17679ae82921SPaul Mullowney         } else {
1768213423ffSJunchao Zhang           m    = A->rmap->n;
1769213423ffSJunchao Zhang           ii   = a->i;
1770e6e9a74fSStefano Zampini           ridx = NULL;
17719ae82921SPaul Mullowney         }
1772abb89eb1SStefano Zampini         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1773abb89eb1SStefano Zampini         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
         /* Without host values the entry count comes from the last row offset and only the structure is uploaded */
1774abb89eb1SStefano Zampini         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1775abb89eb1SStefano Zampini         else nnz = a->nz;
17769ae82921SPaul Mullowney 
177785ba7357SStefano Zampini         /* create cusparse matrix */
1778abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
1779aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
178057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
178157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
178257d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
17839ae82921SPaul Mullowney 
         /* Device-resident scalar constants (1, 0, 1); the handle is switched to device pointer mode right after */
1784afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
17857656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
17867656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1787afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17887656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17897656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
179057d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1791b06137fdSPaul Mullowney 
1792aa372e3fSPaul Mullowney         /* Build the device matrix in the storage format selected for this Mat: plain CSR, or ELL/HYB converted from a temporary CSR */
1793aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1794aa372e3fSPaul Mullowney           /* set the matrix */
1795afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1796afb2bd1cSJunchao Zhang           mat->num_rows = m;
1797afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1798abb89eb1SStefano Zampini           mat->num_entries = nnz;
1799afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1800afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
18019ae82921SPaul Mullowney 
1802abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1803abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1804aa372e3fSPaul Mullowney 
           /* the value array is always allocated; it is filled only when host values exist */
1805abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1806abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1807aa372e3fSPaul Mullowney 
1808aa372e3fSPaul Mullowney           /* assign the pointer */
1809afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1810afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1811afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1812afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1813afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1814afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1815afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1816afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1817afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1818afb2bd1cSJunchao Zhang           }
1819afb2bd1cSJunchao Zhang          #endif
1820aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1821afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
           /* NOTE(review): matstruct and the three device scalars allocated above do not appear to be released
              on this error return, and cusparsestruct->mat has already been destroyed -- confirm whether a
              later cleanup covers them */
1822afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1823afb2bd1cSJunchao Zhang          #else
           /* Stage the data in a temporary device CSR matrix, convert it to HYB/ELL, then free the temporary */
1824afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1825afb2bd1cSJunchao Zhang           mat->num_rows = m;
1826afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1827abb89eb1SStefano Zampini           mat->num_entries = nnz;
1828afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1829afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1830aa372e3fSPaul Mullowney 
1831abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1832abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1833aa372e3fSPaul Mullowney 
1834abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1835abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1836aa372e3fSPaul Mullowney 
1837aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
183857d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
           /* ELL requests the MAX partition, HYB lets cusparse choose (AUTO) */
1839aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1840aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1841afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1842afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1843afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1844afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
184557d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1846aa372e3fSPaul Mullowney           /* assign the pointer */
1847aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1848aa372e3fSPaul Mullowney 
           /* the temporary CSR copy is no longer needed once the HYB matrix exists */
1849afb2bd1cSJunchao Zhang           if (mat) {
1850afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1851afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1852afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1853afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1854087f3262SPaul Mullowney           }
1855afb2bd1cSJunchao Zhang          #endif
1856087f3262SPaul Mullowney         }
1857ca45077fSPaul Mullowney 
1858aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1859213423ffSJunchao Zhang         if (a->compressedrow.use) {
1860213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1861aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1862aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1863213423ffSJunchao Zhang           tmp = m;
1864213423ffSJunchao Zhang         } else {
1865213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1866213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1867213423ffSJunchao Zhang           tmp = 0;
1868213423ffSJunchao Zhang         }
         /* Logged bytes: row offsets + column indices (int), compressed-row indices (PetscInt), 3 scalars + values.
            NOTE(review): a->nz is used here even when a->a is NULL and no values were actually copied */
1869213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1870aa372e3fSPaul Mullowney 
1871aa372e3fSPaul Mullowney         /* assign the pointer */
1872aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
18739ae82921SPaul Mullowney       } catch(char *ex) {
18749ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18759ae82921SPaul Mullowney       }
187605035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
187785ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
       /* remember which nonzero structure the device copy corresponds to, enabling the values-only path next time */
187834d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
187934d6c7a5SJose E. Roman     }
1880abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
18819ae82921SPaul Mullowney   }
18829ae82921SPaul Mullowney   PetscFunctionReturn(0);
18839ae82921SPaul Mullowney }
18849ae82921SPaul Mullowney 
/* Functor over a thrust tuple: accumulates element 0 into element 1 (elem1 = elem1 + elem0). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
1894aa372e3fSPaul Mullowney 
/* Functor over a thrust tuple: copies element 0 into element 1 (elem1 = elem0). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
19047e8381f9SStefano Zampini 
/* Functor over a thrust tuple: copies element 1 into element 0 (elem0 = elem1), the mirror of VecCUDAEquals. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1914e6e9a74fSStefano Zampini 
/*
  Per-product scratch data for matrix products driven by a SEQAIJCUSPARSE matrix.
  It is stored in product->data and released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool             cisdense;     /* NOTE(review): not used in the code visible here; presumably records whether C was a host dense matrix -- confirm */
  PetscScalar           *Bt;          /* device buffer, released with cudaFree() on destroy */
  Mat                   X;            /* intermediate dense matrix; receives the sparse-dense result for PtAP and RARt */
  PetscBool             reusesym;     /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;        /* flop count associated with the product */
  CsrMatrix             *Bcsr;        /* CSR wrapper; only the wrapper object itself is deleted on destroy */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;  /* sparse-matrix descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;  /* set once descriptors and buffer for C = alpha op(A) op(B) + beta C have been set up */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C (or for X) */
  PetscInt              Blda;         /* leading dimension of B the descriptor was created with; recorded to detect changes */
  PetscInt              Clda;         /* leading dimension of C the descriptor was created with; recorded to detect changes */
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                  *mmBuffer;    /* device work buffer handed to cusparseSpMM() */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1934ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product-data object.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy

  Frees the device buffer Bt, deletes the Bcsr wrapper, destroys the cusparse descriptors
  and work buffers that were created (CUDA >= 11 only), destroys the intermediate matrix X
  and finally frees the struct itself. The first failing call aborts the teardown.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* each handle/buffer is released only if it was actually created */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1959ccdfe979SStefano Zampini 
/* Forward declaration of a library-internal (PETSC_INTERN) routine; its definition is not in this part of the file.
   NOTE(review): presumably the dense-CUDA x dense-CUDA numeric product used to finish PtAP/RARt below, with the
   two PetscBool arguments selecting transposes -- confirm against the definition. */
1960ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1961ccdfe979SStefano Zampini 
1962ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1963ccdfe979SStefano Zampini {
1964ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1965ccdfe979SStefano Zampini   Mat                          A,B;
1966afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1967ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1968ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1969ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1970ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1971ccdfe979SStefano Zampini   const PetscScalar            *barray;
1972ccdfe979SStefano Zampini   PetscScalar                  *carray;
1973ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1974ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1975ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1976ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1977afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1978ccdfe979SStefano Zampini 
1979ccdfe979SStefano Zampini   PetscFunctionBegin;
1980ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1981ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1982ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1983ccdfe979SStefano Zampini   A    = product->A;
1984ccdfe979SStefano Zampini   B    = product->B;
1985ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1986ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1987ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1988ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1989ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1990ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1991ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1992ccdfe979SStefano Zampini   switch (product->type) {
1993ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1994ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1995ccdfe979SStefano Zampini     mat = cusp->mat;
1996ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1997ccdfe979SStefano Zampini     m   = A->rmap->n;
1998ccdfe979SStefano Zampini     n   = B->cmap->n;
1999ccdfe979SStefano Zampini     break;
2000ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2001e6e9a74fSStefano Zampini     if (!cusp->transgen) {
2002e6e9a74fSStefano Zampini       mat = cusp->mat;
2003e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2004e6e9a74fSStefano Zampini     } else {
2005ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2006ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2007ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2008e6e9a74fSStefano Zampini     }
2009ccdfe979SStefano Zampini     m = A->cmap->n;
2010ccdfe979SStefano Zampini     n = B->cmap->n;
2011ccdfe979SStefano Zampini     break;
2012ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2013ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2014ccdfe979SStefano Zampini     mat = cusp->mat;
2015ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2016ccdfe979SStefano Zampini     m   = A->rmap->n;
2017ccdfe979SStefano Zampini     n   = B->rmap->n;
2018ccdfe979SStefano Zampini     break;
2019ccdfe979SStefano Zampini   default:
2020ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2021ccdfe979SStefano Zampini   }
2022ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2023ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2024ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2025ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2026afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2027ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2028afb2bd1cSJunchao Zhang 
2029ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2030c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2031c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2032c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2033c8378d12SStefano Zampini   } else {
2034c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2035c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2036c8378d12SStefano Zampini   }
2037c8378d12SStefano Zampini 
2038c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2039afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2040afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2041fcdce8c4SStefano Zampini   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2042afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2043fcdce8c4SStefano Zampini     size_t mmBufferSize;
2044afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2045afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2046afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2047afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2048afb2bd1cSJunchao Zhang     }
2049c8378d12SStefano Zampini 
2050afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2051afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2052afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2053afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2054afb2bd1cSJunchao Zhang     }
2055afb2bd1cSJunchao Zhang 
2056afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2057afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2058afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2059afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2060afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2061afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2062afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2063afb2bd1cSJunchao Zhang     }
2064afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2065afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2066afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2067fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2068fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2069fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2070fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2071fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2072fcdce8c4SStefano Zampini     }
2073afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2074afb2bd1cSJunchao Zhang   } else {
2075afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2076afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2077afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2078afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2079afb2bd1cSJunchao Zhang   }
2080afb2bd1cSJunchao Zhang 
2081afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2082afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2083afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2084afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2085fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2086afb2bd1cSJunchao Zhang  #else
2087afb2bd1cSJunchao Zhang   PetscInt k;
2088afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2089ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2090ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2091ccdfe979SStefano Zampini     cublasStatus_t cerr;
2092ccdfe979SStefano Zampini 
2093ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2094ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2095ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2096ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2097ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2098ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2099ccdfe979SStefano Zampini     blda = B->cmap->n;
2100afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2101afb2bd1cSJunchao Zhang   } else {
2102afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2103ccdfe979SStefano Zampini   }
2104ccdfe979SStefano Zampini 
2105afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2106ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2107afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2108ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2109ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2110ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2111ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2112ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2113afb2bd1cSJunchao Zhang  #endif
2114afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2115c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2116c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2117ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2118ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2119ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2120ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2121ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2122ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2123ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2124ccdfe979SStefano Zampini   } else {
2125ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2126ccdfe979SStefano Zampini   }
2127ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2128ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2129ccdfe979SStefano Zampini   }
2130ccdfe979SStefano Zampini   if (!biscuda) {
2131ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2132ccdfe979SStefano Zampini   }
2133ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2134ccdfe979SStefano Zampini }
2135ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a product between a
  MATSEQAIJCUSPARSE matrix A (CSR storage only) and the matrix B stored in C->product.

  Input/Output Parameter:
. C - the product matrix; C->product must be set and C->product->data must be empty

  What it does:
  - sizes C according to the product type (AB, AtB, ABt, PtAP, RARt);
  - remembers whether C was of type MATSEQDENSE and then switches C to MATSEQDENSECUDA;
  - allocates the MatMatCusparse product data and, when needed, the work storage it owns
    (Bt for CUDA < 11 with ABt/RARt, the intermediate dense matrix X for RARt/PtAP);
  - installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine.

  Errors:
  PETSC_ERR_PLIB if product data is already present, A is not MATSEQAIJCUSPARSE, A is not stored
  in MAT_CUSPARSE_CSR format, or the product type is not one of the five listed above.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes (m x n) of the result for each supported product */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data: hand ownership to C->product right away, so that a failure in any of the
     allocations below does not leave mmdata (and the buffers hanging off it) unreachable.
     This mirrors what the sparse-sparse symbolic routine in this file does.
     NOTE(review): relies on MatDestroy_MatMatCusparse coping with fields that are still zero */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2209ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a sparse-sparse product
  whose operands A, B and result C are all MATSEQAIJCUSPARSE matrices in CSR format.

  Input/Output Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse data created by the
      symbolic phase

  Notes:
  Handles AB, AtB and ABt. AtB with A flagged symmetric, and ABt with B flagged symmetric, are
  treated as AB. The transposed operand is taken from the matTranspose struct of the operand,
  so cuSPARSE is always called with non-transposed operations.
  With CUDA >= 11 the values are computed by cusparseSpGEMM_compute() followed by
  cusparseSpGEMM_copy(); with older CUDA by cusparse_csr_spgemm().
  On return C is flagged as assembled.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *prod = C->product;
  Mat_SeqAIJ                   *cseq = (Mat_SeqAIJ*)C->data;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  MatMatCusparse               *mmdata;
  MatProductType               optype;
  PetscBool                    sametype;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  Ccusp  = (Mat_SeqAIJCUSPARSE*)C->spptr;
  A      = prod->A;
  B      = prod->B;

  /* With api_user the values were already computed during MatProductSymbolic: consume the
     flag, sanity check the structures of C and go straight to the assembly bookkeeping */
  if (mmdata->reusesym) {
    mmdata->reusesym = PETSC_FALSE;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  /* nothing to compute for a product without nonzeros */
  if (!cseq->nz) goto finalize;

  /* both operands must still be GPU matrices stored in CSR format */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu || B->boundtocpu) {
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  }
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR || Bcusp->format != MAT_CUSPARSE_CSR || Ccusp->format != MAT_CUSPARSE_CSR) {
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  }
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* a transposed product with a symmetric operand is the plain product */
  optype = prod->type;
  if (optype == MATPRODUCT_AtB && A->symmetric) optype = MATPRODUCT_AB;
  if (optype == MATPRODUCT_ABt && B->symmetric) optype = MATPRODUCT_AB;

  /* select the mult structs: the transposed operand comes from its matTranspose struct */
  if (optype == MATPRODUCT_AB) {
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
  } else if (optype == MATPRODUCT_AtB) {
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
  } else if (optype == MATPRODUCT_ABt) {
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
  } else {
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[prod->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[optype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[optype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[optype]);

  Acsr = (CsrMatrix*)Amat->mat;
  /* B may be in compressed row storage: in that case use the CSR kept in the product data */
  if (mmdata->Bcsr) Bcsr = mmdata->Bcsr;
  else              Bcsr = (CsrMatrix*)Bmat->mat;
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* the sparse descriptor of B follows the same rule as its CSR struct above */
  if (mmdata->Bcsr) BmatSpDescr = mmdata->matSpBDescr;
  else              BmatSpDescr = Bmat->matDescr;
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  /* the flop count stored in the product data is logged as is */
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the values of C were just written on the device */
  C->offloadmask = PETSC_OFFLOAD_GPU;

finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,cseq->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",cseq->rmax);CHKERRQ(ierr);
  cseq->reallocs      = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2324fcdce8c4SStefano Zampini 
2325fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2326fcdce8c4SStefano Zampini {
2327fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2328fcdce8c4SStefano Zampini   Mat                          A,B;
2329fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2330fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2331fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2332fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2333fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2334fcdce8c4SStefano Zampini   PetscBool                    flg;
2335fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2336fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2337fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2338fcdce8c4SStefano Zampini   MatProductType               ptype;
2339fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2340fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2341fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2342fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2343fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2344fcdce8c4SStefano Zampini   size_t                       bufSize2;
2345fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2346fcdce8c4SStefano Zampini #else
2347fcdce8c4SStefano Zampini   int                          cnz;
2348fcdce8c4SStefano Zampini #endif
2349fcdce8c4SStefano Zampini 
2350fcdce8c4SStefano Zampini   PetscFunctionBegin;
2351fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2352fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2353fcdce8c4SStefano Zampini   A    = product->A;
2354fcdce8c4SStefano Zampini   B    = product->B;
2355fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2356fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2357fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2358fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2359fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2360fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2361fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2362fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2363fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2364fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2365fcdce8c4SStefano Zampini 
2366fcdce8c4SStefano Zampini   /* product data */
2367fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2368fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2369fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2370fcdce8c4SStefano Zampini 
2371fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2372fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2373fcdce8c4SStefano Zampini   ptype = product->type;
2374fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2375fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2376fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2377fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2378fcdce8c4SStefano Zampini   switch (ptype) {
2379fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2380fcdce8c4SStefano Zampini     m = A->rmap->n;
2381fcdce8c4SStefano Zampini     n = B->cmap->n;
2382fcdce8c4SStefano Zampini     k = A->cmap->n;
2383fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2384fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2385fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2386fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2387fcdce8c4SStefano Zampini     break;
2388fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2389fcdce8c4SStefano Zampini     m = A->cmap->n;
2390fcdce8c4SStefano Zampini     n = B->cmap->n;
2391fcdce8c4SStefano Zampini     k = A->rmap->n;
2392fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2393fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2394fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2395fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2396fcdce8c4SStefano Zampini     break;
2397fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2398fcdce8c4SStefano Zampini     m = A->rmap->n;
2399fcdce8c4SStefano Zampini     n = B->rmap->n;
2400fcdce8c4SStefano Zampini     k = A->cmap->n;
2401fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2402fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2403fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2404fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2405fcdce8c4SStefano Zampini     break;
2406fcdce8c4SStefano Zampini   default:
2407fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2408fcdce8c4SStefano Zampini   }
2409fcdce8c4SStefano Zampini 
2410fcdce8c4SStefano Zampini   /* create cusparse matrix */
2411fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2412fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2413fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2414fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2415fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2416fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2417fcdce8c4SStefano Zampini 
2418fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2419fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2420fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2421fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2422fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2423fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2424fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2425fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2426fcdce8c4SStefano Zampini   } else {
2427fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2428fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2429fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2430fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2431fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2432fcdce8c4SStefano Zampini   }
2433fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2434fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2435fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2436fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2437fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2438fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2439fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2440fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2441fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2442fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2443fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2444fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2445fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2446fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2447fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2448fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2449fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2450fcdce8c4SStefano Zampini     c->nz = 0;
2451fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2452fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2453fcdce8c4SStefano Zampini     goto finalizesym;
2454fcdce8c4SStefano Zampini   }
2455fcdce8c4SStefano Zampini 
2456fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2457fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2458fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2459fcdce8c4SStefano Zampini   if (!biscompressed) {
2460fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2461fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2462fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2463fcdce8c4SStefano Zampini #endif
2464fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2465fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2466fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2467fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2468fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2469fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2470fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2471fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2472fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2473fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2474fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2475fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2476fcdce8c4SStefano Zampini     }
2477fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2478fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2479fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2480fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2481fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2482fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2483fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2484fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2485fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2486fcdce8c4SStefano Zampini     }
2487fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2488fcdce8c4SStefano Zampini #endif
2489fcdce8c4SStefano Zampini   }
2490fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2491fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2492fcdce8c4SStefano Zampini   /* precompute flops count */
2493fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2494fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2495fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2496fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2497fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2498fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2499fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2500fcdce8c4SStefano Zampini       }
2501fcdce8c4SStefano Zampini     }
2502fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2503fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2504fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2505fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2506fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2507fcdce8c4SStefano Zampini     }
2508fcdce8c4SStefano Zampini   } else { /* TODO */
2509fcdce8c4SStefano Zampini     flops = 0.;
2510fcdce8c4SStefano Zampini   }
2511fcdce8c4SStefano Zampini 
2512fcdce8c4SStefano Zampini   mmdata->flops = flops;
2513fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2514fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2515fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2516fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2517fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2518fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2519fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2520fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2521fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2522fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2523fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2524fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2525fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2526bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2527fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2528fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2529fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2530fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2531fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2532fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2533fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2534fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2535fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2536fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2537fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2538fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2539fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2540fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2541fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2542bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2543fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2544fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2545fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2546fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2547fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2548fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2549fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2550fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
255100702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2552fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2553fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2554fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2555fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2556fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2557fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2558fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2559fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2560fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2561fcdce8c4SStefano Zampini #else
2562fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2563fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2564fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2565fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2566fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2567fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2568fcdce8c4SStefano Zampini   c->nz = cnz;
2569fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2570fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2571fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2572fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2573fcdce8c4SStefano Zampini 
2574fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2575fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2576fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2577fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2578fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2579fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2580fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2581fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2582fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2583fcdce8c4SStefano Zampini #endif
2584fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2585fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2586fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2587fcdce8c4SStefano Zampini finalizesym:
2588fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2589fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2590fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2591fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2592fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2593fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2594fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2595fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2596fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2597fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2598fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2599fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2600fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2601fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2602fcdce8c4SStefano Zampini   } else {
2603fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2604fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2605fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2606fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2607fcdce8c4SStefano Zampini   }
2608fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2609fcdce8c4SStefano Zampini     PetscInt r = 0;
2610fcdce8c4SStefano Zampini     c->i[0] = 0;
2611fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2612fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2613fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2614fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2615fcdce8c4SStefano Zampini     }
2616fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2617fcdce8c4SStefano Zampini   }
2618fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2619fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2620fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2621fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2622fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2623fcdce8c4SStefano Zampini   c->rmax = 0;
2624fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2625fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2626fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2627fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2628fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2629fcdce8c4SStefano Zampini   }
2630fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2631fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2632fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2633fcdce8c4SStefano Zampini 
2634fcdce8c4SStefano Zampini   C->nonzerostate++;
2635fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2636fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2637fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2638fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2639fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2640fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2641fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2642abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2643fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2644fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2645fcdce8c4SStefano Zampini   }
2646fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2647fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2648fcdce8c4SStefano Zampini }
2649fcdce8c4SStefano Zampini 
2650fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2651fcdce8c4SStefano Zampini 
/*
  Chooses the symbolic-phase routine for the product described by mat->product.
  Handles a dense or a sparse (SeqAIJCUSPARSE) B; every other combination is
  forwarded to the plain SeqAIJ product setup.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *prod = mat->product;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)prod->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B is only treated as a CUSPARSE operand when neither A nor B is bound to the CPU */
  if (!(prod->A->boundtocpu || prod->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)prod->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  /* C is only inspected for triple products; for all other types Ccusp stays PETSC_TRUE */
  if (prod->type == MATPRODUCT_ABC) {
    Ccusp = PETSC_FALSE;
    if (!prod->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)prod->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }

  if (Bdense) { /* dense B */
    if (prod->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (prod->type == MATPRODUCT_AB   || prod->type == MATPRODUCT_AtB  || prod->type == MATPRODUCT_ABt ||
               prod->type == MATPRODUCT_PtAP || prod->type == MATPRODUCT_RARt) {
      if (prod->A->boundtocpu) { /* A lives on the CPU: defer to the SeqAIJ-SeqDense setup */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
    /* any other product type: leave productsymbolic untouched */
    PetscFunctionReturn(0);
  }

  if (!Bcusp || !Ccusp) { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* sparse B (and, for ABC, sparse C) of type SeqAIJCUSPARSE */
  switch (prod->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
    mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    break;
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
  case MATPRODUCT_ABC:
    mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
2710ccdfe979SStefano Zampini 
/* yy = A xx: forwards to the shared mult-add kernel with no addend (NULL), no transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2719e6e9a74fSStefano Zampini 
/* zz = A xx + yy: forwards to the shared mult-add kernel, no transpose */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2728e6e9a74fSStefano Zampini 
/* yy = A^H xx: forwards to the shared mult-add kernel with no addend (NULL), transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2737e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: forwards to the shared mult-add kernel, transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27469ae82921SPaul Mullowney 
/* yy = A^T xx: forwards to the shared mult-add kernel with no addend (NULL), transpose without conjugation */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2755ca45077fSPaul Mullowney 
/*
  ScatterAdd - y[idx[i]] += x[i] for every i in [0,n), one thread per entry.

  Input Parameters:
+ n   - number of entries to scatter
. idx - destination index in y for each entry of x
- x   - values to add

  Output Parameter:
. y   - array updated in place

  Notes:
  Indexes with blockIdx.x/blockDim.x/threadIdx.x only, i.e. it expects a 1D launch providing at least
  n threads in total; threads with a global index >= n do nothing.
  The update is a plain (non-atomic) +=.
  NOTE(review): this presumably relies on idx[0..n) holding distinct values - confirm against the callers.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Build the global index in PetscInt. blockIdx.x*blockDim.x alone is evaluated in 32-bit unsigned
     arithmetic and was then narrowed to int, which can wrap/truncate when PetscInt is 64-bit and n is large */
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
2761*a0e72f99SJunchao Zhang 
2762afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2763e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27649ae82921SPaul Mullowney {
27659ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2766aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27679ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2768e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2769b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
277057d48284SJunchao Zhang   cudaError_t                  cerr;
2771aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2772e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2773e6e9a74fSStefano Zampini   PetscBool                    compressed;
2774afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2775afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2776afb2bd1cSJunchao Zhang #endif
27776e111a19SKarl Rupp 
27789ae82921SPaul Mullowney   PetscFunctionBegin;
2779e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2780e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2781afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2782d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2783e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2784e6e9a74fSStefano Zampini   }
278534d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
278634d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2787e6e9a74fSStefano Zampini   if (!trans) {
27889ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2789c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2790e6e9a74fSStefano Zampini   } else {
2791e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2792e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2793e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2794e6e9a74fSStefano Zampini     } else {
2795afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2796e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2797e6e9a74fSStefano Zampini     }
2798e6e9a74fSStefano Zampini   }
2799e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2800e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2801213423ffSJunchao Zhang 
2802e6e9a74fSStefano Zampini   try {
2803e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2804213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2805213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2806afb2bd1cSJunchao Zhang 
280785ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2808e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2809afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2810afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2811afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2812afb2bd1cSJunchao Zhang       */
2813e6e9a74fSStefano Zampini       xptr = xarray;
2814afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2815213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2816afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2817afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2818afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2819afb2bd1cSJunchao Zhang        */
2820afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2821afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2822afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2823afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2824afb2bd1cSJunchao Zhang       }
2825afb2bd1cSJunchao Zhang      #endif
2826e6e9a74fSStefano Zampini     } else {
2827afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2828afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2829afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2830afb2bd1cSJunchao Zhang        */
2831afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2832e6e9a74fSStefano Zampini       dptr = zarray;
2833e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2834afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2835e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2836*a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2837e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2838e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2839e6e9a74fSStefano Zampini       }
2840afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2841afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2842afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2843afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2844afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2845afb2bd1cSJunchao Zhang       }
2846afb2bd1cSJunchao Zhang      #endif
2847e6e9a74fSStefano Zampini     }
28489ae82921SPaul Mullowney 
2849afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2850aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2851afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2852afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2853afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2854afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2855afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2856afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2857afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2858afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2859afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2860afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2861afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2862afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2863afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2864afb2bd1cSJunchao Zhang 
2865afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2866afb2bd1cSJunchao Zhang       } else {
2867afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2868afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2869afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2870afb2bd1cSJunchao Zhang       }
2871afb2bd1cSJunchao Zhang 
2872afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2873afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2874afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2875afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2876afb2bd1cSJunchao Zhang                                beta,
2877afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2878afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2879afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2880afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2881afb2bd1cSJunchao Zhang      #else
28827656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2883e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2884a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2885afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2886aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2887e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
288857d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2889afb2bd1cSJunchao Zhang      #endif
2890aa372e3fSPaul Mullowney     } else {
2891213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2892afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2893afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2894afb2bd1cSJunchao Zhang        #else
2895301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2896e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2897afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2898e6e9a74fSStefano Zampini                                  xptr, beta,
289957d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2900afb2bd1cSJunchao Zhang        #endif
2901a65300a6SPaul Mullowney       }
2902aa372e3fSPaul Mullowney     }
290305035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2904958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2905aa372e3fSPaul Mullowney 
2906e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2907213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2908213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2909213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2910e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2911213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
29127656d835SStefano Zampini         }
2913213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2914c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
29157656d835SStefano Zampini       }
29167656d835SStefano Zampini 
2917213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2918213423ffSJunchao Zhang       if (compressed) {
2919e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2920*a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2921*a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2922*a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
2923*a0e72f99SJunchao Zhang          */
2924*a0e72f99SJunchao Zhang        #if 0
2925*a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2926*a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2927*a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2928e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2929c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
2930*a0e72f99SJunchao Zhang        #else
2931*a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
2932*a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2933*a0e72f99SJunchao Zhang        #endif
293405035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2935958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2936e6e9a74fSStefano Zampini       }
2937e6e9a74fSStefano Zampini     } else {
2938e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2939e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2940e6e9a74fSStefano Zampini       }
2941e6e9a74fSStefano Zampini     }
2942e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2943213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2944213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29459ae82921SPaul Mullowney   } catch(char *ex) {
29469ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29479ae82921SPaul Mullowney   }
2948e6e9a74fSStefano Zampini   if (yy) {
2949958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2950e6e9a74fSStefano Zampini   } else {
2951e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2952e6e9a74fSStefano Zampini   }
29539ae82921SPaul Mullowney   PetscFunctionReturn(0);
29549ae82921SPaul Mullowney }
29559ae82921SPaul Mullowney 
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - multiply-transpose-add entry point for SEQAIJCUSPARSE matrices.

   All of the work is done by MatMultAddKernel_SeqAIJCUSPARSE(), which is called with
   PETSC_TRUE and PETSC_FALSE as its two trailing flags (presumably "use the transpose"
   and "not Hermitian" -- confirm against the kernel's signature).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2964ca45077fSPaul Mullowney 
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - finishes assembly of a SEQAIJCUSPARSE matrix.

   The host-side MatAssemblyEnd_SeqAIJ() is always run. Afterwards, for a final
   (non-flush) assembly of a matrix that is not bound to the CPU and that owns a
   device-side split-CSR structure, the offload mask is set to PETSC_OFFLOAD_GPU.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscBool      hasDeviceMat = PETSC_FALSE;

  PetscFunctionBegin;
  /* Sample the device-matrix pointer before the host assembly runs; only unfactored matrices carry one */
  if (A->factortype == MAT_FACTOR_NONE && ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat) hasDeviceMat = PETSC_TRUE;

  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if the matrix was assembled on the GPU */

  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && hasDeviceMat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscFunctionReturn(0);
}
29819ae82921SPaul Mullowney 
29829ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscInt       *rowCounts = (PetscInt*)nnz; /* the preallocation routine takes a non-const pointer */

  PetscFunctionBegin;
  /* Create the object, give it identical local and global sizes, then fix its type */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Preallocate the host storage */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,rowCounts);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30419ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - releases the GPU-side data of a SEQAIJCUSPARSE matrix and then
   destroys the underlying SeqAIJ matrix.

   Steps, in order:
     1. Destroy A->spptr: the Mat_SeqAIJCUSPARSE struct for an unfactored matrix, or the
        triangular-factor struct otherwise. For an unfactored matrix the device-side split-CSR
        pointer is detached first so that it can be freed here.
     2. Free the detached device split-CSR structure, if there was one.
     3. Remove the composed functions listed below from the object.
     4. Call MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat = NULL;
  size_t                     k;
  /* composed functions that are cleared before the base destroy */
  const char *const          composed[] = {"MatSeqAIJCopySubArray_C",
                                           "MatCUSPARSESetFormat_C",
                                           "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                           "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                           "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                           "MatFactorGetSolverType_C",
                                           "MatSetPreallocationCOO_C",
                                           "MatSetValuesCOO_C"};

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* take over the device matrix pointer before the containing struct is destroyed */
    deviceMat       = cusp->deviceMat;
    cusp->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }

  if (deviceMat) {
    Mat_SeqAIJ                 *aij = (Mat_SeqAIJ*)A->data;
    PetscSplitCSRDataStructure hostCopy;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* bring the struct to the host so that the device pointers stored inside it can be read */
    cerr = cudaMemcpy( &hostCopy, deviceMat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (aij->compressedrow.use) { /* diag.i is freed here only when compressed rows are in use */
      cerr = cudaFree(hostCopy.diag.i);CHKERRCUDA(cerr);
    }
    cerr = cudaFree(deviceMat);CHKERRCUDA(cerr);
  }

  for (k = 0; k < sizeof(composed)/sizeof(composed[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30779ae82921SPaul Mullowney 
/* Forward declarations of routines that are defined further down in this file */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/*
   MatDuplicate_SeqAIJCUSPARSE - duplicates a SEQAIJCUSPARSE matrix.

   The duplicate is first created by the host SeqAIJ implementation and is then
   converted in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: host-side duplicate */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* step 2: in-place conversion of the new matrix to the CUSPARSE type */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30899ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y for SEQAIJCUSPARSE matrices.

   Dispatch:
     - X and Y have different axpy implementations: fall back to MatAXPY_SeqAIJ().
     - SUBSET_NONZERO_PATTERN: cuSPARSE csr geam, writing the result in place into Y.
     - SAME_NONZERO_PATTERN:   cuBLAS axpy on the arrays of stored values.
     - anything else:          fall back to MatAXPY_SeqAIJ().

   Before dispatching, a pattern that was not declared SAME_NONZERO_PATTERN is promoted to it
   when the row offsets and column indices of X and Y compare equal on the device.
   The fallbacks invalidate the cached transpose of Y first.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *xaij = (Mat_SeqAIJ*)X->data,*yaij = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cuspY = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cuspX = (Mat_SeqAIJCUSPARSE*)X->spptr;
  CsrMatrix          *csrY,*csrX;
  PetscScalar        *yvals;
  const PetscScalar  *xvals;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cuspY->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cuspX->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csrY = (CsrMatrix*)cuspY->mat->mat;
  csrX = (CsrMatrix*)cuspX->mat->mat;

  /* see if we can turn this into a cublas axpy: same nonzero count, no compressed rows, identical index arrays */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    /* the column indices are compared only when the row offsets already match */
    bool identical = thrust::equal(thrust::device,csrY->row_offsets->begin(),csrY->row_offsets->end(),csrX->row_offsets->begin()) &&
                     thrust::equal(thrust::device,csrY->column_indices->begin(),csrY->column_indices->end(),csrX->column_indices->begin());
    if (identical) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SUBSET_NONZERO_PATTERN:
  {
    cusparseStatus_t cstat;
    PetscScalar      unit = 1.0; /* coefficient applied to Y in the geam call */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           workSize;
    void             *work;
#endif

    ierr  = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
    /* the scalars &a and &unit live on the host, so switch the handle to host pointer mode */
    cstat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(cstat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11: query the size of, and allocate, the external work buffer */
    cstat = cusparse_csr_spgeam_bufferSize(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                           &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                           &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                           cuspY->mat->descr,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),&workSize);CHKERRCUSPARSE(cstat);
    cerr  = cudaMalloc(&work,workSize);CHKERRCUDA(cerr);
#endif
    ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* the output descriptor and arrays are those of Y, i.e. the sum overwrites Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cstat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                cuspY->mat->descr,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),work);CHKERRCUSPARSE(cstat);
#else
    cstat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                cuspY->mat->descr,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get());CHKERRCUSPARSE(cstat);
#endif
    cerr  = WaitForCUDA();CHKERRCUDA(cerr);
    ierr  = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr  = cudaFree(work);CHKERRCUDA(cerr);
#endif
    /* switch the handle back to device pointer mode */
    cstat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
    ierr  = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
    break;
  }
  case SAME_NONZERO_PATTERN:
  {
    cublasHandle_t blasHandle;
    cublasStatus_t bstat;
    PetscBLASInt   stride = 1, count = 1;

    /* identical patterns: the update is a dense axpy over the stored values */
    ierr  = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
    ierr  = PetscCUBLASGetHandle(&blasHandle);CHKERRQ(ierr);
    ierr  = PetscBLASIntCast(xaij->nz,&count);CHKERRQ(ierr);
    ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    bstat = cublasXaxpy(blasHandle,count,&a,xvals,stride,yvals,stride);CHKERRCUBLAS(bstat);
    cerr  = WaitForCUDA();CHKERRCUDA(cerr);
    ierr  = PetscLogGpuFlops(2.0*count);CHKERRQ(ierr);
    ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
    break;
  }
  default:
    /* any other pattern is handled by the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}
319095639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - scales the stored values of Y by the scalar a on the GPU.

   The value array obtained from MatSeqAIJCUSPARSEGetArray() is scaled with a single
   cuBLAS scal call over y->nz entries; one flop per entry is logged and the cached
   diagonal information of Y is invalidated afterwards.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  cublasStatus_t bstat;
  cublasHandle_t blasHandle;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  PetscBLASInt   stride = 1, count = 1;

  PetscFunctionBegin;
  ierr  = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr  = PetscCUBLASGetHandle(&blasHandle);CHKERRQ(ierr);
  ierr  = PetscBLASIntCast(aij->nz,&count);CHKERRQ(ierr); /* checked conversion of the nonzero count to a BLAS integer */

  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  bstat = cublasXscal(blasHandle,count,&a,vals,stride);CHKERRCUBLAS(bstat);
  cerr  = WaitForCUDA();CHKERRCUDA(cerr);
  ierr  = PetscLogGpuFlops(count);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
321433c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - zeroes the stored values of A on the host and, where present, on the GPU.

   For an unfactored matrix the device value arrays of the matrix and of its stored transpose
   are filled with zeros when they exist. The host value array is always zeroed. The offload
   mask ends up as PETSC_OFFLOAD_BOTH when the device values of the matrix itself were
   zeroed, and as PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuZeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
        gpuZeroed = PETSC_TRUE; /* host and device will hold the same (zero) values */
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }

  /* zero the host values directly (a->i[nrows] entries) instead of calling MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
32463fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - switches an unfactored SEQAIJCUSPARSE matrix between its
   GPU and host implementations.

   flg == PETSC_TRUE : MatSeqAIJCUSPARSECopyFromGPU() is called, the operation table is pointed
                       at the SeqAIJ routines and the CUSPARSE-specific composed functions are removed.
   flg == PETSC_FALSE: the CUSPARSE operations and composed functions are installed.

   In both cases A->boundtocpu and the inode usage flag are set to flg. Factored matrices are left untouched.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);

  if (!flg) {
    /* GPU side: install the CUSPARSE operations ... */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* ... and the composed functions */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* host side: copy from the GPU before switching to the SeqAIJ kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the CUSPARSE-specific composed functions; the array getter is pointed at the SeqAIJ one */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }

  A->boundtocpu  = flg;
  aij->inode.use = flg; /* inode usage follows the binding: on when bound to the CPU, off otherwise */
  PetscFunctionReturn(0);
}
3297a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Gives a SeqAIJ matrix the SeqAIJCUSPARSE type.

   Input Parameters:
+  A     - the source matrix
.  mtype - requested type name (not referenced in the body)
-  reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
           already existing *newmat; for any other value *newmat is used as it is

   Output Parameter:
.  newmat - the matrix that carries the CUSPARSE operations on return

   Notes:
   A CUSPARSE handle bound to PetscDefaultCudaStream is created only when the matrix is not
   being reused and has no spptr yet. Which container is allocated depends on the factor type
   of the matrix.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  /* the first use of CUSPARSE may happen through MatConvert, so make sure CUDA is initialized */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to copy: *newmat is converted as it is */
    break;
  }
  B = *newmat;

  /* vectors obtained from this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: container for the triangular factors */
      Mat_SeqAIJCUSPARSETriFactors *fs;

      ierr = PetscNew(&fs);CHKERRQ(ierr);
      stat = cusparseCreate(&fs->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(fs->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = fs;
    } else {
      /* regular matrix: container used by the matrix-vector kernels, CSR storage by default */
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      cusp->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = cusp;
      cusp->deviceMat = NULL;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* type specific operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* called with PETSC_FALSE, i.e. the matrix is not bound to the CPU */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33479ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Constructor for the MATSEQAIJCUSPARSE type.

   Builds B as a SeqAIJ matrix, converts it in place to SeqAIJCUSPARSE and then
   processes the CUSPARSE specific options for it.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);

  /* PetscOptionsObject is supplied by the PetscObjectOptionsBegin()/PetscOptionsEnd() pair */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
336002fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later
   (and are rejected as unsupported when building against CUDA 11.0 or later).
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
33777f756511SDominic Meiser 
/* Factory registered below; it is defined outside of this part of the file */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/*
   MatSolverTypeRegister_CUSPARSE - Registers the CUSPARSE solver package for MATSEQAIJCUSPARSE
   matrices, once for each of the LU, Cholesky, ILU and ICC factorization types.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* registration order: LU, Cholesky, ILU, ICC */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              k;

  PetscFunctionBegin;
  for (k=0; k<sizeof(ftypes)/sizeof(ftypes[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
339229b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Releases a Mat_SeqAIJCUSPARSE container and everything it holds:
   the mult structures for the matrix and its transpose, the work arrays, and the CUSPARSE handle.

   A NULL *cusparsestruct is accepted and nothing is done in that case.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* deleting a NULL pointer is a no-op, so these need no guards */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  /* free through the caller's pointer, as the original code does */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34127f756511SDominic Meiser 
/*
   CsrMatrix_Destroy - Deletes the values, column indices and row offsets of a CsrMatrix,
   then the CsrMatrix itself, and sets the caller's pointer to NULL.

   A NULL *mat is accepted and nothing is done in that case.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
34257f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for a triangular factor structure.

   Destroys the matrix descriptor and the solve analysis info, the CSR matrix, and the
   solve, host (AA_h) and, with CUDA 11 or later, csr2csc buffers; then frees the structure.
   Members that are not set are skipped. A NULL *trifactor is accepted.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    /* AA_h is released with cudaFreeHost(), unlike the other buffers */
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* free through the caller's pointer, as the original code does */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34457f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for a matrix-vector multiplication structure.

   Input Parameters:
+  matstruct - address of the structure to destroy; *matstruct is set to NULL on return
-  format    - storage format of the matrix held in (*matstruct)->mat

   Notes:
   For MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB the stored matrix is a cusparseHybMat_t; with
   CUDA 11.0 or later these formats raise PETSC_ERR_SUP. Otherwise the stored matrix is a
   CsrMatrix. A NULL *matstruct is accepted and nothing is done in that case.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* scalar constants released with cudaFree() */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* only the SpMV slots that were initialized own a buffer and vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
34897f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSETriFactors_Reset - Destroys the four triangular factor structures (lower,
   upper and their transposes) and deletes the permutation index and work vectors, setting those
   three vector pointers to NULL. The container itself and its CUSPARSE handle are left alone.

   A NULL *trifactors is accepted and nothing is done in that case.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!fs) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);

  delete fs->rpermIndices;
  fs->rpermIndices = NULL;
  delete fs->cpermIndices;
  fs->cpermIndices = NULL;
  delete fs->workVector;
  fs->workVector = NULL;
  PetscFunctionReturn(0);
}
3509ccdfe979SStefano Zampini 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - Resets the triangular factors container, destroys its
   CUSPARSE handle if one is set, and frees the container.

   A NULL *trifactors is accepted and nothing is done in that case.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  /* read the handle after the reset, as the original code does */
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
35267e8381f9SStefano Zampini 
/* Strict ordering of two integer pairs: by first component, ties broken by the second component */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
35377e8381f9SStefano Zampini 
/* Equality of two integer pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
35477e8381f9SStefano Zampini 
/* Change indicator: yields 1 when the two values differ and 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
35567e8381f9SStefano Zampini 
/* Logical OR of two integers: yields 1 when at least one of them is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
35657e8381f9SStefano Zampini 
#include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - Sets the values of the device CSR matrix from an array given
   in the COO ordering stored by MatSetPreallocationCOO_SeqAIJCUSPARSE().

   Input Parameters:
+  A     - the matrix
.  v     - the values, in host or device memory (checked with isCudaMem()); with NULL and
           INSERT_VALUES the stored values are zeroed, with NULL and any other mode they are kept
-  imode - ADD_VALUES adds to the stored values, any other mode overwrites them

   Notes:
   When no COO permutation is stored the routine only runs the regular assembly.
   When cooPerm_a is present, input values sharing a key are summed before being stored.
   On return the matrix is flagged as assembled with its up-to-date data on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *aij = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *staged = NULL; /* device copy of v, only for host input */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *csr;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              ncoo;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    goto finalize;
  }

  ncoo = cusp->cooPerm->size();
  if (!isCudaMem(v)) {
    /* host input: stage it on the device first */
    staged = new THRUSTARRAY(ncoo);
    staged->assign(v,v+ncoo);
    d_v = staged->data();
    ierr = PetscLogCpuToGpu(ncoo*sizeof(PetscScalar));CHKERRQ(ierr);
  } else {
    d_v = thrust::device_pointer_cast(v);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusp->cooPerm_a) {
    /* repeated entries in COO: values sharing a key are reduced into a single stored entry */
    auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());

    if (imode == ADD_VALUES) {
      /* reduce into a temporary, then add the temporary to the existing values */
      THRUSTARRAY *sums = new THRUSTARRAY(csr->values->size());

      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),sums->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(sums->begin(),sums->end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
      delete sums;
    } else {
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    }
  } else {
    /* no repeated entries: pair every permuted input value with its stored entry */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                              csr->values->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                              csr->values->end()));

    if (imode == ADD_VALUES) {
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    } else {
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

finalize:
  delete staged;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,aij->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",aij->rmax);CHKERRQ(ierr);
  aij->reallocs       = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
36457e8381f9SStefano Zampini 
/*
   MatSeqAIJCUSPARSEInvalidateTranspose - Marks the cached transpose of A as out of date.

   Input Parameters:
+  A       - the matrix, which must be of type MATSEQAIJCUSPARSE
-  destroy - when true the transpose mult structure and the csr2csc_i array are also released

   Notes:
   Nothing is done when A has no CUSPARSE struct attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);

  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  cusp->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3662a49f1ed0SStefano Zampini 
#include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - Builds the CSR structure of A from a list of COO
   (row,column) pairs and stores what MatSetValuesCOO_SeqAIJCUSPARSE() needs to place values.

   Input Parameters:
+  A     - the matrix
.  n     - number of COO entries
.  coo_i - row indices of the entries
-  coo_j - column indices of the entries

   Notes:
   The pairs are sorted by (row,column) on the device; cooPerm holds the permutation applied
   by that sort. When some pairs are repeated, cooPerm_a holds for every sorted entry the index
   of the unique pair it belongs to; when all pairs are unique cooPerm_a is NULL.
   The host CSR arrays of A are rebuilt from the unique pairs, the values are zeroed and copied
   to the GPU, and any cached transpose is destroyed. A is left flagged as not assembled.
   With n equal to zero an empty preallocation is set through MatSeqAIJSetPreallocation().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* cusp is dereferenced right below: fail with an error rather than a NULL dereference */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* the stored permutation arrays have the wrong size: drop them */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) keys, recording in cooPerm where every sorted entry came from */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted keys: thrust::unique() below compacts d_i and d_j in place */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag the positions where the row or the column changes, then scan the flags so that
         every sorted entry ends up with the index of its unique (i,j) pair */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries whose row index is <= r, i.e. the CSR offset a->i[r+1] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* row lengths, number of nonempty rows and the longest row */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values still have to be provided through MatSetValuesCOO() */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3763ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayRead - read-only access to the GPU array holding the nonzero values

  Input Parameter:
. A - matrix of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the values array stored in the CSR struct of A

  Notes:
  Errors with PETSC_ERR_SUP for the ELL and HYB storage formats.
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is fetched.
  Errors with PETSC_ERR_COR when the mult struct, the CSR struct or its values array is missing.
  The matching call is MatSeqAIJCUSPARSERestoreArrayRead().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated, so that a bad handle raises a PETSc error instead of crashing */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR struct itself before looking at its members */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3782ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - ends the read-only access started with MatSeqAIJCUSPARSEGetArrayRead()

  Input Parameters:
+ A - matrix of type MATSEQAIJCUSPARSE
- a - address of the pointer handed out by the Get call; it is reset to NULL

  Notes:
  Nothing else is touched: the matrix is not modified by this routine.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  /* argument validation */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* drop the caller's reference */
  (*a) = NULL;
  PetscFunctionReturn(0);
}
3792ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - read-write access to the GPU array holding the nonzero values

  Input Parameter:
. A - matrix of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the values array stored in the CSR struct of A

  Notes:
  Errors with PETSC_ERR_SUP for the ELL and HYB storage formats.
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is fetched.
  Errors with PETSC_ERR_COR when the mult struct, the CSR struct or its values array is missing.
  On success the offload mask of A is set to PETSC_OFFLOAD_GPU and
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called.
  The matching call is MatSeqAIJCUSPARSERestoreArray().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated, so that a bad handle raises a PETSc error instead of crashing */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR struct itself before looking at its members */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may change the values through the returned pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3813039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - ends the read-write access started with MatSeqAIJCUSPARSEGetArray()

  Input Parameters:
+ A - matrix of type MATSEQAIJCUSPARSE
- a - address of the pointer handed out by the Get call; it is reset to NULL

  Notes:
  The object state of A is increased before the pointer is cleared.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* argument validation */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* record that the matrix may have changed, then drop the caller's reference */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  (*a) = NULL;
  PetscFunctionReturn(0);
}
3826039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - write access to the GPU array holding the nonzero values

  Input Parameter:
. A - matrix of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the values array stored in the CSR struct of A

  Notes:
  Errors with PETSC_ERR_SUP for the ELL and HYB storage formats.
  Unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU() is issued here.
  Errors with PETSC_ERR_COR when the mult struct, the CSR struct or its values array is missing.
  On success the offload mask of A is set to PETSC_OFFLOAD_GPU and
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called.
  The matching call is MatSeqAIJCUSPARSERestoreArrayWrite().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated, so that a bad handle raises a PETSc error instead of crashing */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR struct itself before looking at its members */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller will change the values through the returned pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3846ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - ends the write access started with MatSeqAIJCUSPARSEGetArrayWrite()

  Input Parameters:
+ A - matrix of type MATSEQAIJCUSPARSE
- a - address of the pointer handed out by the Get call; it is reset to NULL

  Notes:
  The object state of A is increased before the pointer is cleared.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* argument validation */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* record that the matrix has changed, then drop the caller's reference */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  (*a) = NULL;
  PetscFunctionReturn(0);
}
3859ed502f03SStefano Zampini 
/*
  "Less than" comparator for 4-entry tuples: the first entries are compared and,
  when they are equal, the second entries decide. The third and fourth entries
  do not take part in the comparison.

  thrust::get<N>() is used instead of the member form t.get<N>(), since the free
  function is the accessor available for every thrust::tuple implementation.
  operator() is const so the comparator can be invoked on a const object.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    const int i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 != i2) return i1 < i2;                   /* first key decides */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* tie: second key decides */
  }
};
3870ed502f03SStefano Zampini 
/*
  Unary functor adding a fixed integer offset to its argument.

  The constructor is marked __host__ __device__ like operator(), and operator()
  is const since it does not modify the functor, so it can be invoked on a
  const object.
*/
struct Shift
{
  int _shift; /* offset added to every input */

  __host__ __device__
  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3882ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3884ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3885ed502f03SStefano Zampini {
3886ed502f03SStefano Zampini   PetscErrorCode               ierr;
3887ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3888ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3889ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3890ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3891ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
3892ed502f03SStefano Zampini   cusparseStatus_t             stat;
3893ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
3894ed502f03SStefano Zampini   cudaError_t                  cerr;
3895ed502f03SStefano Zampini 
3896ed502f03SStefano Zampini   PetscFunctionBegin;
3897ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3898ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3899ed502f03SStefano Zampini   PetscValidPointer(C,4);
3900ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3901ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3902ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3903ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3904ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3905ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3906ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
3907ed502f03SStefano Zampini     m     = A->rmap->n;
3908ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
3909ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3910ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3911ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3912ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
3913ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3914ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3915ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
3916ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
3917ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
3918ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
3919ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
3920ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
3921ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
3922ed502f03SStefano Zampini     Ccusp->nrows    = m;
3923ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
3924ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
3925ed502f03SStefano Zampini     Ccsr->num_rows  = m;
3926ed502f03SStefano Zampini     Ccsr->num_cols  = n;
3927ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3928ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3929ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3930ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3931ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3932ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3933ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3934ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3935ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3936ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3937ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3938ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
3939ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
3940ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3941ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3942ed502f03SStefano Zampini 
3943ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
3944ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3945ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
3946ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
3947ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
3948ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3949ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3950ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
3951ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
3952ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3953ed502f03SStefano Zampini     if (c->nz) {
39542ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
39552ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
39562ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
39572ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
39582ed87e7eSStefano Zampini 
3959ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
3960ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
3961ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3962ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3963ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3964ed502f03SStefano Zampini         }
39652ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
39662ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
3967ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
3968ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
3969ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3970ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3971ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3972ed502f03SStefano Zampini         }
39732ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
39742ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
3975ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
39762ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
39772ed87e7eSStefano Zampini                               Aroff->data().get(),
39782ed87e7eSStefano Zampini                               Annz,
39792ed87e7eSStefano Zampini                               m,
39802ed87e7eSStefano Zampini                               Acoo->data().get(),
39812ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3982ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
39832ed87e7eSStefano Zampini                               Broff->data().get(),
3984ed502f03SStefano Zampini                               Bnnz,
3985ed502f03SStefano Zampini                               m,
39862ed87e7eSStefano Zampini                               Bcoo->data().get(),
3987ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
39882ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
39892ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
39902ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
39918909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3992ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3993ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
39948909a122SStefano Zampini #else
39958909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
39968909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
39978909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
39988909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
39998909a122SStefano Zampini #endif
40002ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
40012ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
40022ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
40032ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
40042ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
40052ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4006ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4007ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4008ed502f03SStefano Zampini       thrust::advance(p2,Annz);
40092ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
40108909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
40118909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
40128909a122SStefano Zampini #endif
40132ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
40142ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
40152ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
40162ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
40172ed87e7eSStefano Zampini #else
40182ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
40192ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
40202ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
40212ed87e7eSStefano Zampini #endif
4022ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
40232ed87e7eSStefano Zampini                               Ccoo->data().get(),
4024ed502f03SStefano Zampini                               c->nz,
4025ed502f03SStefano Zampini                               m,
4026ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4027ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4028ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4029ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
40302ed87e7eSStefano Zampini       delete wPerm;
40312ed87e7eSStefano Zampini       delete Acoo;
40322ed87e7eSStefano Zampini       delete Bcoo;
40332ed87e7eSStefano Zampini       delete Ccoo;
4034ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4035ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4036ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4037ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4038ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4039ed502f03SStefano Zampini #endif
4040ed502f03SStefano Zampini       if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
4041ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4042ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4043ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4044ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4045ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4046ed502f03SStefano Zampini 
4047ed502f03SStefano Zampini         Ccusp->transgen = PETSC_TRUE;
4048a49f1ed0SStefano Zampini         Ccusp->transupdated = PETSC_TRUE;
4049a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4050ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4051ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4052ed502f03SStefano Zampini         CcsrT->num_rows = n;
4053ed502f03SStefano Zampini         CcsrT->num_cols = m;
4054ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4055ed502f03SStefano Zampini 
4056ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4057ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4058ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4059ed502f03SStefano Zampini 
4060ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4061ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4062ed502f03SStefano Zampini         if (AT) {
4063ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4064ed502f03SStefano Zampini           thrust::advance(rT,-1);
4065ed502f03SStefano Zampini         }
4066ed502f03SStefano Zampini         if (BT) {
4067ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4068ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4069ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4070ed502f03SStefano Zampini         }
4071ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4072ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4073ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4074ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4075ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4076ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4077ed502f03SStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4078ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4079ed502f03SStefano Zampini 
4080ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4081ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4082ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4083ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4084ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4085ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4086ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4087ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4088ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4089ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4090ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4091ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4092ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4093ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4094ed502f03SStefano Zampini #endif
4095ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4096ed502f03SStefano Zampini       }
4097ed502f03SStefano Zampini     }
4098ed502f03SStefano Zampini 
4099ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4100ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4101ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4102ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4103ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4104ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4105ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4106ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4107ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4108ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4109ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4110ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4111ed502f03SStefano Zampini     } else {
4112ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4113ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4114ed502f03SStefano Zampini     }
4115ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4116ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4117ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4118ed502f03SStefano Zampini     c->maxnz = c->nz;
4119ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4120ed502f03SStefano Zampini     c->rmax = 0;
4121ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4122ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4123ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4124ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4125ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4126ed502f03SStefano Zampini     }
4127ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4128ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4129ed502f03SStefano Zampini     (*C)->nonzerostate++;
4130ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4131ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4132ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4133ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4134ed502f03SStefano Zampini   } else {
4135ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4136ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4137ed502f03SStefano Zampini     if (c->nz) {
4138ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4139ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4140ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4141ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4142ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4143ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4144ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4145ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4146ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4147ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4148ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4149ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4150ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4151ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4152ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4153ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4154ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4155ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4156ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4157ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4158ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4159ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4160ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4161ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4162ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4163ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4164ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4165ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4166ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4167a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4168ed502f03SStefano Zampini       if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
4169ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4170ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4171ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4172ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4173ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4174ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4175ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4176ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4177a49f1ed0SStefano Zampini         Ccusp->transupdated = PETSC_TRUE;
4178ed502f03SStefano Zampini       }
4179ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4180ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4181ed502f03SStefano Zampini     }
4182ed502f03SStefano Zampini   }
4183ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4184ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4185ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4186ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4187ed502f03SStefano Zampini   PetscFunctionReturn(0);
4188ed502f03SStefano Zampini }
4189c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the array returned by
  MatSeqAIJCUSPARSEGetArrayRead() into a user-provided buffer.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions, within the value array, of the entries to gather (uploaded to the GPU and
        logged as a CPU-to-GPU transfer, so it is expected to be host memory);
        if NULL (or n is 0) the first n entries are copied contiguously

  Output Parameter:
. v - destination of length n; isCudaMem(v) selects between the device and host code paths

  Notes:
  When v is not device memory and idx is given, the gather is performed into a temporary
  device buffer which is then copied back to v.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* staging buffer, only sized when v is host memory; being an automatic object it is
       released on every exit path, including the early returns taken by the error macros */
    THRUSTARRAY w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[i]] with dv[i]; VecCUDAEquals is defined elsewhere and presumably assigns the
       first tuple element to the second -- TODO confirm */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the host destination */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index set: plain contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4229