xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d8132acae049d2aac17efe57fbaf1fe35e283edb)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17bc3f50f2SPaul Mullowney 
18e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
20afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
21afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22afb2bd1cSJunchao Zhang 
23afb2bd1cSJunchao Zhang   typedef enum {
24afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
25afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
27afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
28afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
29afb2bd1cSJunchao Zhang 
30afb2bd1cSJunchao Zhang   typedef enum {
31afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
34afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
35afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
43afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
44afb2bd1cSJunchao Zhang 
45afb2bd1cSJunchao Zhang   typedef enum {
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
47afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
48afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
49afb2bd1cSJunchao Zhang   */
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
52afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
53afb2bd1cSJunchao Zhang #endif
549ae82921SPaul Mullowney 
55087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
57087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58087f3262SPaul Mullowney 
596fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
616fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
666fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
674416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
68a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
6933c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
706fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
726fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
736fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
75e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
779ae82921SPaul Mullowney 
787f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
81ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
82470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
847f756511SDominic Meiser 
8557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8657181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
87a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
8857181aedSStefano Zampini 
897e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
907e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
917e8381f9SStefano Zampini 
92c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93c215019aSStefano Zampini 
94b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
95b06137fdSPaul Mullowney {
96b06137fdSPaul Mullowney   cusparseStatus_t   stat;
97b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98b06137fdSPaul Mullowney 
99b06137fdSPaul Mullowney   PetscFunctionBegin;
100d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10257d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
103b06137fdSPaul Mullowney   PetscFunctionReturn(0);
104b06137fdSPaul Mullowney }
105b06137fdSPaul Mullowney 
106b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
107b06137fdSPaul Mullowney {
108b06137fdSPaul Mullowney   cusparseStatus_t   stat;
109b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
110b06137fdSPaul Mullowney 
111b06137fdSPaul Mullowney   PetscFunctionBegin;
112d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1136b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11416a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11557d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11616a2e217SAlejandro Lamas Daviña     }
117b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1186b1cf21dSAlejandro Lamas Daviña   }
11957d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
120b06137fdSPaul Mullowney   PetscFunctionReturn(0);
121b06137fdSPaul Mullowney }
122b06137fdSPaul Mullowney 
123b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
124b06137fdSPaul Mullowney {
125b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1267e8381f9SStefano Zampini   PetscBool          flg;
1277e8381f9SStefano Zampini   PetscErrorCode     ierr;
128ccdfe979SStefano Zampini 
129b06137fdSPaul Mullowney   PetscFunctionBegin;
1307e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1317e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
132ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
133b06137fdSPaul Mullowney   PetscFunctionReturn(0);
134b06137fdSPaul Mullowney }
135b06137fdSPaul Mullowney 
136ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1379ae82921SPaul Mullowney {
1389ae82921SPaul Mullowney   PetscFunctionBegin;
1399ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1409ae82921SPaul Mullowney   PetscFunctionReturn(0);
1419ae82921SPaul Mullowney }
1429ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1559ae82921SPaul Mullowney 
15642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1579ae82921SPaul Mullowney {
1589ae82921SPaul Mullowney   PetscErrorCode ierr;
159bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1609ae82921SPaul Mullowney 
1619ae82921SPaul Mullowney   PetscFunctionBegin;
162bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
163bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1642c7c0729SBarry Smith   (*B)->factortype = ftype;
1652c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1669ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1672205254eSKarl Rupp 
168087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16933d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1709ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1719ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
172087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1759ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
176bc3f50f2SPaul Mullowney 
177fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1783ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1799ae82921SPaul Mullowney   PetscFunctionReturn(0);
1809ae82921SPaul Mullowney }
1819ae82921SPaul Mullowney 
182bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
183ca45077fSPaul Mullowney {
184aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1856e111a19SKarl Rupp 
186ca45077fSPaul Mullowney   PetscFunctionBegin;
187ca45077fSPaul Mullowney   switch (op) {
188e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
189aa372e3fSPaul Mullowney     cusparsestruct->format = format;
190ca45077fSPaul Mullowney     break;
191e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
192aa372e3fSPaul Mullowney     cusparsestruct->format = format;
193ca45077fSPaul Mullowney     break;
194ca45077fSPaul Mullowney   default:
19536d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
196ca45077fSPaul Mullowney   }
197ca45077fSPaul Mullowney   PetscFunctionReturn(0);
198ca45077fSPaul Mullowney }
1999ae82921SPaul Mullowney 
200e057df02SPaul Mullowney /*@
201e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
202e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
203aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
204e057df02SPaul Mullowney    Not Collective
205e057df02SPaul Mullowney 
206e057df02SPaul Mullowney    Input Parameters:
2078468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
20836d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2092692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
210e057df02SPaul Mullowney 
211e057df02SPaul Mullowney    Output Parameter:
212e057df02SPaul Mullowney 
213e057df02SPaul Mullowney    Level: intermediate
214e057df02SPaul Mullowney 
2158468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
216e057df02SPaul Mullowney @*/
217e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
218e057df02SPaul Mullowney {
219e057df02SPaul Mullowney   PetscErrorCode ierr;
2206e111a19SKarl Rupp 
221e057df02SPaul Mullowney   PetscFunctionBegin;
222e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
223e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
224e057df02SPaul Mullowney   PetscFunctionReturn(0);
225e057df02SPaul Mullowney }
226e057df02SPaul Mullowney 
227e6e9a74fSStefano Zampini /*@
228e589036eSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
229e6e9a74fSStefano Zampini 
230e6e9a74fSStefano Zampini    Collective on mat
231e6e9a74fSStefano Zampini 
232e6e9a74fSStefano Zampini    Input Parameters:
233e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
234e6e9a74fSStefano Zampini -  transgen - the boolean flag
235e6e9a74fSStefano Zampini 
236e6e9a74fSStefano Zampini    Level: intermediate
237e6e9a74fSStefano Zampini 
238e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
239e6e9a74fSStefano Zampini @*/
240e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
241e6e9a74fSStefano Zampini {
242e6e9a74fSStefano Zampini   PetscErrorCode ierr;
243e6e9a74fSStefano Zampini   PetscBool      flg;
244e6e9a74fSStefano Zampini 
245e6e9a74fSStefano Zampini   PetscFunctionBegin;
246e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
247e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
248e6e9a74fSStefano Zampini   if (flg) {
249e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
25054da937aSStefano Zampini 
251e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
252e6e9a74fSStefano Zampini     cusp->transgen = transgen;
25354da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
254a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
25554da937aSStefano Zampini     }
256e6e9a74fSStefano Zampini   }
257e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
258e6e9a74fSStefano Zampini }
259e6e9a74fSStefano Zampini 
2604416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2619ae82921SPaul Mullowney {
2629ae82921SPaul Mullowney   PetscErrorCode           ierr;
263e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2649ae82921SPaul Mullowney   PetscBool                flg;
265a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2666e111a19SKarl Rupp 
2679ae82921SPaul Mullowney   PetscFunctionBegin;
268e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2699ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
27054da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
27154da937aSStefano Zampini 
27254da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
273afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
274afb2bd1cSJunchao Zhang 
275e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
276a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
277afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
278afb2bd1cSJunchao Zhang 
2794c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
280a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
281afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
282afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
283afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
284afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
285afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
286afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
287afb2bd1cSJunchao Zhang 
288afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
289afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
290afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
291afb2bd1cSJunchao Zhang 
292afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
293afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
294afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
295afb2bd1cSJunchao Zhang    #endif
2964c87dfd4SPaul Mullowney   }
2970af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2989ae82921SPaul Mullowney   PetscFunctionReturn(0);
2999ae82921SPaul Mullowney }
3009ae82921SPaul Mullowney 
3016fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3029ae82921SPaul Mullowney {
303da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3049ae82921SPaul Mullowney   PetscErrorCode               ierr;
3059ae82921SPaul Mullowney 
3069ae82921SPaul Mullowney   PetscFunctionBegin;
307da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3089ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3099ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3109ae82921SPaul Mullowney   PetscFunctionReturn(0);
3119ae82921SPaul Mullowney }
3129ae82921SPaul Mullowney 
3136fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3149ae82921SPaul Mullowney {
315da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3169ae82921SPaul Mullowney   PetscErrorCode               ierr;
3179ae82921SPaul Mullowney 
3189ae82921SPaul Mullowney   PetscFunctionBegin;
319da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3209ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3219ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3229ae82921SPaul Mullowney   PetscFunctionReturn(0);
3239ae82921SPaul Mullowney }
3249ae82921SPaul Mullowney 
325087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
326087f3262SPaul Mullowney {
327da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
328087f3262SPaul Mullowney   PetscErrorCode               ierr;
329087f3262SPaul Mullowney 
330087f3262SPaul Mullowney   PetscFunctionBegin;
331da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
332087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
333087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
334087f3262SPaul Mullowney   PetscFunctionReturn(0);
335087f3262SPaul Mullowney }
336087f3262SPaul Mullowney 
337087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
338087f3262SPaul Mullowney {
339da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
340087f3262SPaul Mullowney   PetscErrorCode               ierr;
341087f3262SPaul Mullowney 
342087f3262SPaul Mullowney   PetscFunctionBegin;
343da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
344087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
345087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
346087f3262SPaul Mullowney   PetscFunctionReturn(0);
347087f3262SPaul Mullowney }
348087f3262SPaul Mullowney 
349087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3509ae82921SPaul Mullowney {
3519ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3529ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3539ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
354aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3559ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3569ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3579ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3589ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3599ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
360b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
36157d48284SJunchao Zhang   cudaError_t                       cerr;
3629ae82921SPaul Mullowney 
3639ae82921SPaul Mullowney   PetscFunctionBegin;
364cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
365c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3669ae82921SPaul Mullowney     try {
3679ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3689ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
369da79fbbcSStefano Zampini       if (!loTriFactor) {
3702cbc15d9SMark         PetscScalar                       *AALo;
3712cbc15d9SMark 
3722cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3739ae82921SPaul Mullowney 
3749ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
37557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3779ae82921SPaul Mullowney 
3789ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3799ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3809ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3819ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3829ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3839ae82921SPaul Mullowney         v        = aa;
3849ae82921SPaul Mullowney         vi       = aj;
3859ae82921SPaul Mullowney         offset   = 1;
3869ae82921SPaul Mullowney         rowOffset= 1;
3879ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3889ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
389e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3909ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3919ae82921SPaul Mullowney           rowOffset += nz+1;
3929ae82921SPaul Mullowney 
393580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
394580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney           offset      += nz;
3979ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3989ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3999ae82921SPaul Mullowney           offset      += 1;
4009ae82921SPaul Mullowney 
4019ae82921SPaul Mullowney           v  += nz;
4029ae82921SPaul Mullowney           vi += nz;
4039ae82921SPaul Mullowney         }
4042205254eSKarl Rupp 
405aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
406da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
407da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
408aa372e3fSPaul Mullowney         /* Create the matrix description */
40957d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
41057d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4111b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
412afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
413afb2bd1cSJunchao Zhang        #else
41457d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
415afb2bd1cSJunchao Zhang        #endif
41657d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41757d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
418aa372e3fSPaul Mullowney 
419aa372e3fSPaul Mullowney         /* set the operation */
420aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
421aa372e3fSPaul Mullowney 
422aa372e3fSPaul Mullowney         /* set the matrix */
423aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
424aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
425aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
427aa372e3fSPaul Mullowney 
428aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
430aa372e3fSPaul Mullowney 
431aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
432aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
433aa372e3fSPaul Mullowney 
434aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
435aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
436aa372e3fSPaul Mullowney 
437afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
438da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
439afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4401b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
441afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
442afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
443afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
444afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
445afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
446afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
447afb2bd1cSJunchao Zhang       #endif
448afb2bd1cSJunchao Zhang 
449aa372e3fSPaul Mullowney         /* perform the solve analysis */
450aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
451aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
452aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
453afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4541b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
455afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
456afb2bd1cSJunchao Zhang                                #endif
457afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
458da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
459da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
460aa372e3fSPaul Mullowney 
461da79fbbcSStefano Zampini         /* assign the pointer */
462aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4632cbc15d9SMark         loTriFactor->AA_h = AALo;
46457d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
46557d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4664863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
467da79fbbcSStefano Zampini       } else { /* update values only */
4682cbc15d9SMark         if (!loTriFactor->AA_h) {
4692cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4702cbc15d9SMark         }
471da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4722cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
473da79fbbcSStefano Zampini         v        = aa;
474da79fbbcSStefano Zampini         vi       = aj;
475da79fbbcSStefano Zampini         offset   = 1;
476da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
477da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4782cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
479da79fbbcSStefano Zampini           offset      += nz;
4802cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
481da79fbbcSStefano Zampini           offset      += 1;
482da79fbbcSStefano Zampini           v  += nz;
483da79fbbcSStefano Zampini         }
4842cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
485da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
486da79fbbcSStefano Zampini       }
4879ae82921SPaul Mullowney     } catch(char *ex) {
4889ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4899ae82921SPaul Mullowney     }
4909ae82921SPaul Mullowney   }
4919ae82921SPaul Mullowney   PetscFunctionReturn(0);
4929ae82921SPaul Mullowney }
4939ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Fills the upper triangular factor kept in the
  Mat_SeqAIJCUSPARSETriFactors structure attached to A (A->spptr) from the host SeqAIJ data.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ (arrays j, a and diag are used)

  Notes:
  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  If no upper factor is attached yet, row offsets, column indices and values are assembled in
  pinned host buffers, handed to a freshly created CsrMatrix, and the cusparse solve analysis
  is run on it. The pinned value buffer is kept in AA_h; the two index buffers are released.

  If an upper factor already exists only the values are regenerated (into AA_h, which is
  allocated on demand) and reassigned to the existing CsrMatrix; indices and analysis are reused.

  In both cases the first entry of each row is set to the reciprocal of the value found at
  a->a[a->diag[row]], followed by the remaining entries of that row copied unchanged.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows    = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    *cols    = aij->j,*diag = aij->diag;
  const MatScalar                   *vals    = aij->a;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only act when the host copy is the one that is current */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* total number of entries of the upper triangular factor (diag[] is walked backwards) */
    const PetscInt nnzUp = diag[0]-diag[nrows];
    PetscInt       row,nOff,pos = nnzUp;

    if (up) {
      /* ---- factor already set up: refresh the numerical values only ---- */
      if (!up->AA_h) {
        cerr = cudaMallocHost((void**) &up->AA_h, nnzUp*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
      for (row = nrows; row-- > 0;) {
        const MatScalar *rowVals = vals + diag[row+1] + 1;

        /* entries of this row that are NOT the diagonal */
        nOff = diag[row] - diag[row+1]-1;

        /* step back to where this row starts in the packed array */
        pos -= (nOff+1);

        /* diagonal goes first, then the off-diagonal part */
        up->AA_h[pos] = 1./rowVals[nOff];
        ierr = PetscArraycpy(&(up->AA_h[pos+1]), rowVals, nOff);CHKERRQ(ierr);
      }
      up->csrMat->values->assign(up->AA_h, up->AA_h+nnzUp);
      ierr = PetscLogCpuToGpu(nnzUp*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* ---- first time through: build structure, values and solve analysis ---- */
      PetscScalar *hostVals;
      PetscInt    *rowOff,*colIdx;
      CsrMatrix   *csr;

      /* pinned host staging buffers */
      cerr = cudaMallocHost((void**) &hostVals, nnzUp*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &rowOff, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &colIdx, nnzUp*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* assemble the CSR arrays, last row first */
      rowOff[0]     = (PetscInt) 0;
      rowOff[nrows] = nnzUp;
      for (row = nrows; row-- > 0;) {
        const MatScalar *rowVals = vals + diag[row+1] + 1;
        const PetscInt  *rowCols = cols + diag[row+1] + 1;

        /* entries of this row that are NOT the diagonal */
        nOff = diag[row] - diag[row+1]-1;

        /* step back to where this row starts in the packed arrays */
        pos -= (nOff+1);

        /* diagonal entry leads the row */
        colIdx[pos] = (PetscInt) row;
        hostVals[pos] = (MatScalar)1./rowVals[nOff];
        rowOff[row]   = rowOff[row+1] - (nOff+1);

        /* off-diagonal entries follow */
        ierr = PetscArraycpy(&(colIdx[pos+1]), rowCols, nOff);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(hostVals[pos+1]), rowVals, nOff);CHKERRQ(ierr);
      }

      /* container for the triangular factor */
      ierr = PetscNew(&up);CHKERRQ(ierr);
      up->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&up->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(up->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(up->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(up->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the solve applies the factor as stored */
      up->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR storage, populated from the staging buffers */
      up->csrMat = new CsrMatrix;
      csr        = up->csrMat;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nnzUp;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(rowOff, rowOff+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzUp);
      csr->column_indices->assign(colIdx, colIdx+nnzUp);

      csr->values = new THRUSTARRAY(nnzUp);
      csr->values->assign(hostVals, hostVals+nnzUp);

      /* solve analysis (timed under MAT_CUSPARSESolveAnalysis) */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&up->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* newer CUDA needs an explicitly sized work buffer for the analysis/solve */
      stat = cusparse_get_svbuffsize(factors->handle, up->solveOp,
                                     csr->num_rows, csr->num_entries, up->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), up->solveInfo,
                                     &up->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&up->solveBuffer,up->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      stat = cusparse_analysis(factors->handle, up->solveOp,
                               csr->num_rows, csr->num_entries, up->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), up->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               ,up->solvePolicy, up->solveBuffer
                             #endif
);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value buffer stays alive for later value-only updates */
      factors->upTriFactorPtr = up;
      up->AA_h = hostVals;
      cerr = cudaFreeHost(rowOff);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(colIdx);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnzUp)*sizeof(int)+nnzUp*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6379ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Drives the construction of both ILU triangular
  factors for A and stores the row/column permutations needed alongside them.

  Input Parameter:
. A - the matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Errors with PETSC_ERR_COR when A->spptr is NULL.

  After the lower and upper factor builders have run, a work vector of length n is created if
  none exists, the nonzero count is recorded, and A->offloadmask is set to PETSC_OFFLOAD_BOTH.

  The row (a->row) and column (a->icol) index sets are copied into rpermIndices / cpermIndices
  only when they are not the identity and have not been stored before.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowPerm  = aij->row,colPerm = aij->icol;
  PetscBool                    identity;
  const PetscInt               *idx;
  PetscInt                     nrows = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) the two triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector is created once and then reused */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: stored once, and only if it actually permutes */
  ierr = ISIdentity(rowPerm,&identity);CHKERRQ(ierr);
  if (!(identity || factors->rpermIndices)) {
    ierr = ISGetIndices(rowPerm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(rowPerm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(colPerm,&identity);CHKERRQ(ierr);
  if (!(identity || factors->cpermIndices)) {
    ierr = ISGetIndices(colPerm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(colPerm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6819ae82921SPaul Mullowney 
682087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
683087f3262SPaul Mullowney {
684087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
685087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
686aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
687aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
688087f3262SPaul Mullowney   cusparseStatus_t                  stat;
689087f3262SPaul Mullowney   PetscErrorCode                    ierr;
69057d48284SJunchao Zhang   cudaError_t                       cerr;
691087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
692087f3262SPaul Mullowney   PetscScalar                       *AAUp;
693087f3262SPaul Mullowney   PetscScalar                       *AALo;
694087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
695087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
696087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
697087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
698087f3262SPaul Mullowney 
699087f3262SPaul Mullowney   PetscFunctionBegin;
700cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
701c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
702087f3262SPaul Mullowney     try {
703da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
704da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
705da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
706087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
70757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
70857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
709087f3262SPaul Mullowney 
710087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
711087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
712087f3262SPaul Mullowney         AiUp[n]=nzUpper;
713087f3262SPaul Mullowney         offset = 0;
714087f3262SPaul Mullowney         for (i=0; i<n; i++) {
715087f3262SPaul Mullowney           /* set the pointers */
716087f3262SPaul Mullowney           v  = aa + ai[i];
717087f3262SPaul Mullowney           vj = aj + ai[i];
718087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
719087f3262SPaul Mullowney 
720087f3262SPaul Mullowney           /* first, set the diagonal elements */
721087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
72209f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
723087f3262SPaul Mullowney           AiUp[i]      = offset;
72409f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
725087f3262SPaul Mullowney 
726087f3262SPaul Mullowney           offset+=1;
727087f3262SPaul Mullowney           if (nz>0) {
728f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
729580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
730087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
731087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
732087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
733087f3262SPaul Mullowney             }
734087f3262SPaul Mullowney             offset+=nz;
735087f3262SPaul Mullowney           }
736087f3262SPaul Mullowney         }
737087f3262SPaul Mullowney 
738aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
739da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
740da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
741087f3262SPaul Mullowney 
742aa372e3fSPaul Mullowney         /* Create the matrix description */
74357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
74457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7451b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
746afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
747afb2bd1cSJunchao Zhang        #else
74857d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
749afb2bd1cSJunchao Zhang        #endif
75057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
75157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
752087f3262SPaul Mullowney 
753aa372e3fSPaul Mullowney         /* set the matrix */
754aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
755aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
756aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
757aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
758aa372e3fSPaul Mullowney 
759aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
760aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
761aa372e3fSPaul Mullowney 
762aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
763aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
764aa372e3fSPaul Mullowney 
765aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
766aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
767aa372e3fSPaul Mullowney 
768afb2bd1cSJunchao Zhang         /* set the operation */
769afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
770afb2bd1cSJunchao Zhang 
771afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
772da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
773afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7741b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
775afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
776afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
777afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
778afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
779afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
780afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
781afb2bd1cSJunchao Zhang       #endif
782afb2bd1cSJunchao Zhang 
783aa372e3fSPaul Mullowney         /* perform the solve analysis */
784aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
785aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
786aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
787afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7881b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
789afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
790afb2bd1cSJunchao Zhang                                 #endif
791afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
792da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
793da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
794aa372e3fSPaul Mullowney 
795da79fbbcSStefano Zampini         /* assign the pointer */
796aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
797aa372e3fSPaul Mullowney 
798aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
799da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
800da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
801aa372e3fSPaul Mullowney 
802aa372e3fSPaul Mullowney         /* Create the matrix description */
80357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
80457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8051b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
807afb2bd1cSJunchao Zhang        #else
80857d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
809afb2bd1cSJunchao Zhang        #endif
81057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
81157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
812aa372e3fSPaul Mullowney 
813aa372e3fSPaul Mullowney         /* set the operation */
814aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
815aa372e3fSPaul Mullowney 
816aa372e3fSPaul Mullowney         /* set the matrix */
817aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
818aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
819aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
820aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
821aa372e3fSPaul Mullowney 
822aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
823aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
824aa372e3fSPaul Mullowney 
825aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
826aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
827aa372e3fSPaul Mullowney 
828aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
829aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
830aa372e3fSPaul Mullowney 
831afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
832da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
833afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8341b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
835afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
836afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
837afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
838afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
839afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
840afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
841afb2bd1cSJunchao Zhang       #endif
842afb2bd1cSJunchao Zhang 
843aa372e3fSPaul Mullowney         /* perform the solve analysis */
844aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
845aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
846aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
847afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8481b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
849afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
850afb2bd1cSJunchao Zhang                                 #endif
851afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
852da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
853da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
854aa372e3fSPaul Mullowney 
855da79fbbcSStefano Zampini         /* assign the pointer */
856aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
857087f3262SPaul Mullowney 
858da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
85957d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
86057d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
861da79fbbcSStefano Zampini       } else {
862da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
863da79fbbcSStefano Zampini         offset = 0;
864da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
865da79fbbcSStefano Zampini           /* set the pointers */
866da79fbbcSStefano Zampini           v  = aa + ai[i];
867da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
868da79fbbcSStefano Zampini 
869da79fbbcSStefano Zampini           /* first, set the diagonal elements */
870da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
871da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
872da79fbbcSStefano Zampini 
873da79fbbcSStefano Zampini           offset+=1;
874da79fbbcSStefano Zampini           if (nz>0) {
875da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
876da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
877da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
878da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
879da79fbbcSStefano Zampini             }
880da79fbbcSStefano Zampini             offset+=nz;
881da79fbbcSStefano Zampini           }
882da79fbbcSStefano Zampini         }
883da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
884da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
885da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
886da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
887da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
888da79fbbcSStefano Zampini       }
88957d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
89057d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
891087f3262SPaul Mullowney     } catch(char *ex) {
892087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
893087f3262SPaul Mullowney     }
894087f3262SPaul Mullowney   }
895087f3262SPaul Mullowney   PetscFunctionReturn(0);
896087f3262SPaul Mullowney }
897087f3262SPaul Mullowney 
898087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
8999ae82921SPaul Mullowney {
9009ae82921SPaul Mullowney   PetscErrorCode               ierr;
901087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
902087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
903087f3262SPaul Mullowney   IS                           ip = a->row;
904087f3262SPaul Mullowney   PetscBool                    perm_identity;
905087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
906087f3262SPaul Mullowney 
907087f3262SPaul Mullowney   PetscFunctionBegin;
908da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
909087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
910da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
911aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
912aa372e3fSPaul Mullowney 
913da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
914da79fbbcSStefano Zampini 
915087f3262SPaul Mullowney   /* lower triangular indices */
916087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
917087f3262SPaul Mullowney   if (!perm_identity) {
9184e4bbfaaSStefano Zampini     IS             iip;
919da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9204e4bbfaaSStefano Zampini 
9214e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
9224e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
923da79fbbcSStefano Zampini     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
924aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
925aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
926aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9274e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9284e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
9294e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
930087f3262SPaul Mullowney     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
931da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
932da79fbbcSStefano Zampini   }
933087f3262SPaul Mullowney   PetscFunctionReturn(0);
934087f3262SPaul Mullowney }
935087f3262SPaul Mullowney 
9366fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
9379ae82921SPaul Mullowney {
9389ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
9399ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
9409ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
941b175d8bbSPaul Mullowney   PetscErrorCode ierr;
9429ae82921SPaul Mullowney 
9439ae82921SPaul Mullowney   PetscFunctionBegin;
94457181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
9459ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
946ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
947e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9489ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
9499ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
950bda325fcSPaul Mullowney   if (row_identity && col_identity) {
951bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
952bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9534e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9544e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
955bda325fcSPaul Mullowney   } else {
956bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
957bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9584e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9594e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
960bda325fcSPaul Mullowney   }
9618dc1d2a3SPaul Mullowney 
962e057df02SPaul Mullowney   /* get the triangular factors */
963087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
9649ae82921SPaul Mullowney   PetscFunctionReturn(0);
9659ae82921SPaul Mullowney }
9669ae82921SPaul Mullowney 
967087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
968087f3262SPaul Mullowney {
969087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
970087f3262SPaul Mullowney   IS             ip = b->row;
971087f3262SPaul Mullowney   PetscBool      perm_identity;
972b175d8bbSPaul Mullowney   PetscErrorCode ierr;
973087f3262SPaul Mullowney 
974087f3262SPaul Mullowney   PetscFunctionBegin;
97557181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
976087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
977ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
978087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
979087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
980087f3262SPaul Mullowney   if (perm_identity) {
981087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
982087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9834e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9844e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
985087f3262SPaul Mullowney   } else {
986087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
987087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9884e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9894e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
990087f3262SPaul Mullowney   }
991087f3262SPaul Mullowney 
992087f3262SPaul Mullowney   /* get the triangular factors */
993087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
994087f3262SPaul Mullowney   PetscFunctionReturn(0);
995087f3262SPaul Mullowney }
9969ae82921SPaul Mullowney 
997b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
998bda325fcSPaul Mullowney {
999bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1000aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1001aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1002da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1003da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1004bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1005aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
1006aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
1007aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
1008aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
10091b0a6780SStefano Zampini   cudaError_t                       cerr;
1010da79fbbcSStefano Zampini   PetscErrorCode                    ierr;
1011b175d8bbSPaul Mullowney 
1012bda325fcSPaul Mullowney   PetscFunctionBegin;
1013aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
1014da79fbbcSStefano Zampini   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1015da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1016aa372e3fSPaul Mullowney 
1017aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1018aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1019aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1020aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1021aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1022aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1023aa372e3fSPaul Mullowney 
1024aa372e3fSPaul Mullowney   /* Create the matrix description */
102557d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
102657d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
102757d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
102857d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
102957d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1030aa372e3fSPaul Mullowney 
1031aa372e3fSPaul Mullowney   /* set the operation */
1032aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1033aa372e3fSPaul Mullowney 
1034aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1035aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1036afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1037afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1038aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1039afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1040afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1041afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1042aa372e3fSPaul Mullowney 
1043aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1044afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1045afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
1048afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
1049afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
1050afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
1051afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1052afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1053afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
10541b0a6780SStefano Zampini   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1055afb2bd1cSJunchao Zhang #endif
1056afb2bd1cSJunchao Zhang 
1057da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1058aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1059aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1060aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
1061aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
1062aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
1063aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
1064afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1065afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1066afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1067afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1068afb2bd1cSJunchao Zhang                         #else
1069afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1070afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1071afb2bd1cSJunchao Zhang                         #endif
1072afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1073da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1074da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1075aa372e3fSPaul Mullowney 
1076afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1077da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1078afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
10791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1081afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1082afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1083afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1084afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1085afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1086afb2bd1cSJunchao Zhang #endif
1087afb2bd1cSJunchao Zhang 
1088afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1089aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1091afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1092afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
10931b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1094afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1095afb2bd1cSJunchao Zhang                           #endif
1096afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1097da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1098da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1099aa372e3fSPaul Mullowney 
1100da79fbbcSStefano Zampini   /* assign the pointer */
1101aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1102aa372e3fSPaul Mullowney 
1103aa372e3fSPaul Mullowney   /*********************************************/
1104aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1105aa372e3fSPaul Mullowney   /*********************************************/
1106aa372e3fSPaul Mullowney 
1107aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1108da79fbbcSStefano Zampini   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1109da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1110aa372e3fSPaul Mullowney 
1111aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1112aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1113aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1114aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1115aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1116aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1117aa372e3fSPaul Mullowney 
1118aa372e3fSPaul Mullowney   /* Create the matrix description */
111957d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
112057d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
112157d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
112257d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
112357d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1124aa372e3fSPaul Mullowney 
1125aa372e3fSPaul Mullowney   /* set the operation */
1126aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1127aa372e3fSPaul Mullowney 
1128aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1129aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1130afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1131afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1132aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1133afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1134afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1135afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1136aa372e3fSPaul Mullowney 
1137aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1138afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1139afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1140afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1141afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1142afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1143afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1144afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1145afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1147afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1148afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1149afb2bd1cSJunchao Zhang #endif
1150afb2bd1cSJunchao Zhang 
1151da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1152aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1153aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1154aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1155aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1156aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1157aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1158afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1159afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1160afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1161afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1162afb2bd1cSJunchao Zhang                         #else
1163afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1164afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1165afb2bd1cSJunchao Zhang                         #endif
1166afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1167da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1168da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1169aa372e3fSPaul Mullowney 
1170afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1171da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1172afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
11731b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1175afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1176afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1177afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1178afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1179afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1180afb2bd1cSJunchao Zhang   #endif
1181afb2bd1cSJunchao Zhang 
1182afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1183aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1184afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1185afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1186afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
11871b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1188afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1189afb2bd1cSJunchao Zhang                           #endif
1190afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1191da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1193aa372e3fSPaul Mullowney 
1194da79fbbcSStefano Zampini   /* assign the pointer */
1195aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1197bda325fcSPaul Mullowney }
1198bda325fcSPaul Mullowney 
1199a49f1ed0SStefano Zampini struct PetscScalarToPetscInt
1200a49f1ed0SStefano Zampini {
1201a49f1ed0SStefano Zampini   __host__ __device__
1202a49f1ed0SStefano Zampini   PetscInt operator()(PetscScalar s)
1203a49f1ed0SStefano Zampini   {
1204a49f1ed0SStefano Zampini     return (PetscInt)PetscRealPart(s);
1205a49f1ed0SStefano Zampini   }
1206a49f1ed0SStefano Zampini };
1207a49f1ed0SStefano Zampini 
1208b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1209bda325fcSPaul Mullowney {
1210aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1214aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1215b06137fdSPaul Mullowney   cudaError_t                  err;
121685ba7357SStefano Zampini   PetscErrorCode               ierr;
1217b175d8bbSPaul Mullowney 
1218bda325fcSPaul Mullowney   PetscFunctionBegin;
1219a49f1ed0SStefano Zampini   if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1220a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1221a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222a49f1ed0SStefano Zampini   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224a49f1ed0SStefano Zampini   if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1225a49f1ed0SStefano Zampini   if (cusparsestruct->transupdated) PetscFunctionReturn(0);
122685ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1227a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1229a49f1ed0SStefano Zampini   }
1230a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
123257d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
123457d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
123557d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1236aa372e3fSPaul Mullowney 
1237b06137fdSPaul Mullowney     /* set alpha and beta */
1238afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12397656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12407656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12427656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12437656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1244b06137fdSPaul Mullowney 
1245aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1247a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1248554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1249554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1250aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1251a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1254a3fdcf43SKarl Rupp 
1255039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
125681902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1257afb2bd1cSJunchao Zhang 
1258afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&matstructT->matDescr,
1260afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1263afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265afb2bd1cSJunchao Zhang      #endif
1266aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269afb2bd1cSJunchao Zhang    #else
1270aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
127151c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
127251c6d536SStefano Zampini       /* First convert HYB to CSR */
1273aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1274aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1275aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1276aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1279aa372e3fSPaul Mullowney 
1280aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1281aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282aa372e3fSPaul Mullowney                               temp->values->data().get(),
1283aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
128457d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1285aa372e3fSPaul Mullowney 
1286aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1288aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1289aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1290aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1293aa372e3fSPaul Mullowney 
1294aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1296aa372e3fSPaul Mullowney                               temp->values->data().get(),
1297aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1298aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1299aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1300aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1301aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
130257d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1303aa372e3fSPaul Mullowney 
1304aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1305aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
130657d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1310aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1311aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1312aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
131357d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1314aa372e3fSPaul Mullowney 
1315aa372e3fSPaul Mullowney       /* assign the pointer */
1316aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
1317a49f1ed0SStefano Zampini       cusparsestruct->transupdated = PETSC_TRUE;
1318aa372e3fSPaul Mullowney       /* delete temporaries */
1319aa372e3fSPaul Mullowney       if (tempT) {
1320aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1324087f3262SPaul Mullowney       }
1325aa372e3fSPaul Mullowney       if (temp) {
1326aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1327aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1330aa372e3fSPaul Mullowney       }
1331afb2bd1cSJunchao Zhang      #endif
1332aa372e3fSPaul Mullowney     }
1333a49f1ed0SStefano Zampini   }
1334a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1335a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1336a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337a49f1ed0SStefano Zampini     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338a49f1ed0SStefano Zampini     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339a49f1ed0SStefano Zampini     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340a49f1ed0SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341a49f1ed0SStefano Zampini     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342a49f1ed0SStefano Zampini     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343a49f1ed0SStefano Zampini     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344a49f1ed0SStefano Zampini     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1347a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1349a49f1ed0SStefano Zampini     }
1350a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1351a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1352a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1353a49f1ed0SStefano Zampini 
1354a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1357a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1358a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1360a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1361a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1362a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1363a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1364a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1366a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368a49f1ed0SStefano Zampini      #endif
1369a49f1ed0SStefano Zampini 
1370a49f1ed0SStefano Zampini       stat = cusparse_csr2csc(cusparsestruct->handle,
1371a49f1ed0SStefano Zampini                               A->rmap->n,A->cmap->n,matrix->num_entries,
1372a49f1ed0SStefano Zampini                               csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(),
1373a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1374a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1375a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1376a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
1377a49f1ed0SStefano Zampini                               cusparsestruct->csr2cscAlg, csr2cscBuffer
1378a49f1ed0SStefano Zampini                              #else
1379a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1380a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC, indexBase
1381a49f1ed0SStefano Zampini                              #endif
1382a49f1ed0SStefano Zampini );CHKERRCUSPARSE(stat);
1383a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1384a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1385a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1386a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1387a49f1ed0SStefano Zampini      #endif
1388a49f1ed0SStefano Zampini     }
1389a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1390a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1391a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1392a49f1ed0SStefano Zampini   }
139385ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1394213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1395213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1396aa372e3fSPaul Mullowney   /* assign the pointer */
1397aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1398a49f1ed0SStefano Zampini   cusparsestruct->transupdated = PETSC_TRUE;
1399bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1400bda325fcSPaul Mullowney }
1401bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transposed solve using the transposed triangular factors
  stored in A->spptr, for factorizations that carry row/column permutations.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are generated on first use by calling
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). The work vector of the factor struct is used as
  scratch space, so its contents are overwritten.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Everything below is dereferenced unconditionally: fail with a PETSc error instead of crashing.
     The checks come before the Vec arrays are acquired so no Get/Restore pair is left unbalanced. */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing transposed triangular factors");
  if (!tempGPU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing work vector");
  if (!cusparseTriFactors->rpermIndices || !cusparseTriFactors->cpermIndices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing permutation indices");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather bb through rpermIndices into xx */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U: xx -> work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: work vector -> xx */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  /* wait for the queued GPU work before the GPU timer is stopped */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1488bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transposed solve using the transposed
  triangular factors stored in A->spptr; no permutation is applied to bb or xx.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are generated on first use by calling
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). The work vector of the factor struct holds the
  intermediate result between the two triangular solves, so its contents are overwritten.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Both factors and the work vector are dereferenced unconditionally below: fail with a PETSc
     error instead of crashing. Checked before the Vec arrays are acquired. */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing transposed triangular factors");
  if (!tempGPU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing work vector");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: bb -> work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: work vector -> xx */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  /* wait for the queued GPU work before the GPU timer is stopped */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1556bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Solve using the triangular factors stored in A->spptr, for
  factorizations that carry row/column permutations.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  Unlike the transposed variants, nothing is generated on the fly here: the factors, the work
  vector and the permutation arrays must already be present. The work vector is used as scratch
  space, so its contents are overwritten.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Everything below is dereferenced unconditionally: fail with a PETSc error instead of crashing.
     The checks come before the Vec arrays are acquired so no Get/Restore pair is left unbalanced. */
  if (!loTriFactor || !upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing triangular factors");
  if (!tempGPU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing work vector");
  if (!cusparseTriFactors->rpermIndices || !cusparseTriFactors->cpermIndices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing permutation indices");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather bb through rpermIndices into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: work vector -> xx */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: xx -> work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: gather the work vector through cpermIndices into xx */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  /* wait for the queued GPU work before the GPU timer is stopped */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16319ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Applies the stored triangular factors of A to the
  right-hand side bb and writes the result into xx, without any row or column permutation.

  The work is done by two cusparse_solve() calls: the lower factor is applied to bb with the
  result placed in the factors' work vector, then the upper factor is applied to that work
  vector with the result placed directly in xx.

  Input Parameters:
+ A  - matrix whose spptr holds a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only through its CUDA array

  Output Parameter:
. xx - solution, accessed write-only through its CUDA array
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  CsrMatrix                         *Lcsr    = lower->csrMat;
  CsrMatrix                         *Ucsr    = upper->csrMat;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol,*scratch;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Acquire the device pointers of the solution and of the right-hand side */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  scratch = work->data().get(); /* raw device pointer of the intermediate vector */

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: lower factor, bb -> scratch */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        Lcsr->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        Lcsr->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        Lcsr->values->data().get(),
                        Lcsr->row_offsets->data().get(),
                        Lcsr->column_indices->data().get(),
                        lower->solveInfo,
                        rhs, scratch
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Step 2: upper factor, scratch -> xx */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        Ucsr->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        Ucsr->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        Ucsr->values->data().get(),
                        Ucsr->row_offsets->data().get(),
                        Ucsr->column_indices->data().get(),
                        upper->solveInfo,
                        scratch, sol
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Hand the arrays back, wait for the device, then close the timing/flop accounting */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16919ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the nonzero values of A from the device CSR storage into
  the host array a->a when the offload mask says the current values live only on the GPU.

  Only the a->nz scalar values are transferred; row offsets and column indices are not touched.
  On success the offload mask becomes PETSC_OFFLOAD_BOTH. For any other offload mask the
  function is a no-op.

  Errors are raised (instead of dereferencing an invalid pointer) when
   - A is a factored matrix (A->spptr then holds the triangular-factor data, not a Mat_SeqAIJCUSPARSE),
   - the GPU mult struct or its CSR values are missing,
   - the GPU storage format is not MAT_CUSPARSE_CSR (the opaque matrix pointer is then a
     cusparseHybMat_t and must not be read as a CsrMatrix),
   - the host values array is missing although the matrix has nonzeros.

  Input Parameter:
. A - the matrix
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* validate everything that is about to be dereferenced */
    if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not for factored matrices");
    if (!cusp || !cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Copy from GPU is only implemented for the MAT_CUSPARSE_CSR format");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values on the GPU");
    if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device now hold the same values */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17127e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host array of nonzero values of A.

  The values are first brought back from the GPU if needed (MatSeqAIJCUSPARSECopyFromGPU()).
  The offload mask is then set to PETSC_OFFLOAD_CPU, since the caller receives a writable
  pointer to the host data.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a of the Mat_SeqAIJ data)
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* refresh the host copy before exposing it */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
17247e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the GPU representation of A current.

  Nothing is done unless the offload mask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  Two paths exist:

   1. Values only: when the nonzero state recorded in the GPU struct matches A->nonzerostate
      and the format is MAT_CUSPARSE_CSR, only the a->nz values are re-uploaded into the
      existing device array.

   2. Full rebuild: otherwise the existing mult struct, work vector and row-offset cache are
      destroyed and a new Mat_SeqAIJCUSPARSEMultStruct is built from the host CSR data (or from
      the compressed-row data when a->compressedrow.use is set). For MAT_CUSPARSE_ELL and
      MAT_CUSPARSE_HYB a temporary CSR copy is converted with cusparse_csr2hyb() (not supported
      since CUDA 11.0).

  If the host values array a->a is absent, only the sparsity pattern is uploaded and the offload
  mask is left unchanged; otherwise it becomes PETSC_OFFLOAD_BOTH.

  Input Parameter:
. A - the matrix; must not be bound to the CPU
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* throw away every GPU object tied to the old nonzero pattern */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row description: compressed rows (with their row index map) or plain CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* without host values only the pattern is uploaded, so the GPU copy is not complete */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary device CSR copy, used only as input of the CSR -> HYB conversion */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed; every member was allocated above */
          delete mat->values;
          delete mat->column_indices;
          delete mat->row_offsets;
          delete mat;
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log what was actually uploaded: m+1 row offsets and nnz column indices (int),
           tmp compressed-row indices (PetscInt), the 3 scalar constants, and the nnz
           values only when a host values array exists */
        ierr = PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero pattern the GPU copy corresponds to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18819ae82921SPaul Mullowney 
/* Functor over a 2-tuple: adds element 0 to element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple zipped)
  {
    /* element<1> <- element<1> + element<0> */
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
1891aa372e3fSPaul Mullowney 
/* Functor over a 2-tuple: copies element 0 into element 1 */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple zipped)
  {
    /* element<1> <- element<0> */
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
19017e8381f9SStefano Zampini 
/* Functor over a 2-tuple: copies element 1 into element 0 (opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple zipped)
  {
    /* element<0> <- element<1> */
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1911e6e9a74fSStefano Zampini 
/* Per-product data attached to a matrix product involving a SeqAIJCUSPARSE matrix;
   released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;      /* NOTE(review): presumably records whether C was created as a (host) dense matrix - confirm against the symbolic phase */
  PetscScalar           *Bt;           /* device buffer, released with cudaFree() */
  Mat                   X;             /* intermediate dense matrix that receives the sparse-dense product for RARt and PtAP */
  PetscBool             reusesym;      /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;         /* NOTE(review): presumably the flop count logged for the product - confirm */
  CsrMatrix             *Bcsr;         /* CsrMatrix object released with delete */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse matrix descriptor, released with cusparseDestroySpMat() */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor of B */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor of C (or of X) */
  PetscInt              Blda,Clda;     /* Record leading dimensions of B and C here to detect changes */
  size_t                mmBufferSize;  /* size in bytes of mmBuffer */
  void                  *mmBuffer;     /* device work buffer passed to cusparseSpMM() */
  void                  *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* SpGEMM descriptor, released with cusparseSpGEMM_destroyDescr() */
#endif
};
1931ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product-data object.

  Frees the device buffers, the cuSPARSE descriptors (CUDA 11 and later), the intermediate
  matrix X, and finally the struct itself.

  Input Parameter:
. data - pointer to the MatMatCusparse struct
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  /* members that exist for every CUDA version */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and work buffers are released only if they were created */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  /* last: the struct itself */
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1956ccdfe979SStefano Zampini 
1957ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1958ccdfe979SStefano Zampini 
1959ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1960ccdfe979SStefano Zampini {
1961ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1962ccdfe979SStefano Zampini   Mat                          A,B;
1963afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1964ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1965ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1966ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1967ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1968ccdfe979SStefano Zampini   const PetscScalar            *barray;
1969ccdfe979SStefano Zampini   PetscScalar                  *carray;
1970ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1971ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1972ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1973ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1974afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1975ccdfe979SStefano Zampini 
1976ccdfe979SStefano Zampini   PetscFunctionBegin;
1977ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1978ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1979ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1980ccdfe979SStefano Zampini   A    = product->A;
1981ccdfe979SStefano Zampini   B    = product->B;
1982ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1983ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1984ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1985ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1986ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1987ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1988ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1989ccdfe979SStefano Zampini   switch (product->type) {
1990ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1991ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1992ccdfe979SStefano Zampini     mat = cusp->mat;
1993ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1994ccdfe979SStefano Zampini     m   = A->rmap->n;
1995ccdfe979SStefano Zampini     n   = B->cmap->n;
1996ccdfe979SStefano Zampini     break;
1997ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1998e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1999e6e9a74fSStefano Zampini       mat = cusp->mat;
2000e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2001e6e9a74fSStefano Zampini     } else {
2002ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2003ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2004ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2005e6e9a74fSStefano Zampini     }
2006ccdfe979SStefano Zampini     m = A->cmap->n;
2007ccdfe979SStefano Zampini     n = B->cmap->n;
2008ccdfe979SStefano Zampini     break;
2009ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2010ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2011ccdfe979SStefano Zampini     mat = cusp->mat;
2012ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2013ccdfe979SStefano Zampini     m   = A->rmap->n;
2014ccdfe979SStefano Zampini     n   = B->rmap->n;
2015ccdfe979SStefano Zampini     break;
2016ccdfe979SStefano Zampini   default:
2017ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2018ccdfe979SStefano Zampini   }
2019ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2020ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2021ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2022ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2023afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2024ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2025afb2bd1cSJunchao Zhang 
2026ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2027c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2028c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2029c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2030c8378d12SStefano Zampini   } else {
2031c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2032c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2033c8378d12SStefano Zampini   }
2034c8378d12SStefano Zampini 
2035c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2036afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2037afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2038fcdce8c4SStefano Zampini   /* (re)allcoate mmBuffer if not initialized or LDAs are different */
2039afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2040fcdce8c4SStefano Zampini     size_t mmBufferSize;
2041afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2042afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2043afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2044afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2045afb2bd1cSJunchao Zhang     }
2046c8378d12SStefano Zampini 
2047afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2048afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2049afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2050afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2051afb2bd1cSJunchao Zhang     }
2052afb2bd1cSJunchao Zhang 
2053afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2054afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2055afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2056afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2057afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2058afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2059afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2060afb2bd1cSJunchao Zhang     }
2061afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2062afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2063afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2064fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2065fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2066fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2067fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2068fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2069fcdce8c4SStefano Zampini     }
2070afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2071afb2bd1cSJunchao Zhang   } else {
2072afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2073afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2074afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2075afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2076afb2bd1cSJunchao Zhang   }
2077afb2bd1cSJunchao Zhang 
2078afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2079afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2080afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2081afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2082fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2083afb2bd1cSJunchao Zhang  #else
2084afb2bd1cSJunchao Zhang   PetscInt k;
2085afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2086ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2087ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2088ccdfe979SStefano Zampini     cublasStatus_t cerr;
2089ccdfe979SStefano Zampini 
2090ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2091ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2092ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2093ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2094ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2095ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2096ccdfe979SStefano Zampini     blda = B->cmap->n;
2097afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2098afb2bd1cSJunchao Zhang   } else {
2099afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2100ccdfe979SStefano Zampini   }
2101ccdfe979SStefano Zampini 
2102afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2103ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2104afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2105ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2106ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2107ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2108ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2109ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2110afb2bd1cSJunchao Zhang  #endif
2111afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2112c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2113c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2114ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2115ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2116ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2117ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2118ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2119ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2120ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2121ccdfe979SStefano Zampini   } else {
2122ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2123ccdfe979SStefano Zampini   }
2124ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2125ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2126ccdfe979SStefano Zampini   }
2127ccdfe979SStefano Zampini   if (!biscuda) {
2128ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2129ccdfe979SStefano Zampini   }
2130ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2131ccdfe979SStefano Zampini }
2132ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase of a product between a
  MATSEQAIJCUSPARSE matrix A (product->A) and the matrix B (product->B) whose result C is dense.

  Input Parameter:
. C - the product matrix; C->product must exist and C->product->data must still be empty

  Notes:
  Handles MATPRODUCT_AB, MATPRODUCT_AtB, MATPRODUCT_ABt, MATPRODUCT_PtAP and MATPRODUCT_RARt;
  any other product type raises PETSC_ERR_PLIB. PETSC_ERR_PLIB is also raised when A is not of
  type MATSEQAIJCUSPARSE or is not stored in MAT_CUSPARSE_CSR format.

  On success C has its sizes set, its type switched to MATSEQDENSECUDA, a MatMatCusparse
  context attached to C->product->data (released through MatDestroy_MatMatCusparse) and
  C->ops->productnumeric pointing to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.

  The context is attached to C->product immediately after it is allocated, so that the
  allocations performed afterwards (the B^T buffer, the intermediate matrix X) are not leaked
  when one of them fails and this routine returns early with an error.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes (m x n) of the result for each supported product */
  switch (product->type) {
  case MATPRODUCT_AB:   /* C = A * B */
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:  /* C = A^T * B */
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:  /* C = A * B^T */
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* C = B^T * A * B */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* C = B * A * B^T */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data: hand ownership to C right away so that an early error return below cannot leak it.
     NOTE(review): assumes MatDestroy_MatMatCusparse tolerates a context whose remaining fields are
     still zero; MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE registers it in the same way */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2206ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of a sparse-sparse product
  whose operands product->A, product->B and result C are all of type MATSEQAIJCUSPARSE.

  Input Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse context

  Notes:
  Three cases are distinguished:
    - mmdata->reusesym is set: the flag is cleared, the GPU structures of C are validated and
      only the assembly bookkeeping at the end is performed;
    - C has no nonzeros (c->nz == 0): only the assembly bookkeeping is performed;
    - otherwise: A and B are validated and copied to the GPU, the values of C are computed
      with cuSPARSE and C->offloadmask is set to PETSC_OFFLOAD_GPU.

  MATPRODUCT_AB, MATPRODUCT_AtB and MATPRODUCT_ABt are supported. AtB is treated as AB when
  A->symmetric is set, and ABt is treated as AB when B->symmetric is set.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
  MatMatCusparse *mmdata;
  PetscBool      sametype;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;

  if (mmdata->reusesym) {
    /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    Mat_SeqAIJCUSPARSE           *Ccusp;
    Mat_SeqAIJCUSPARSEMultStruct *Cmat;
    CsrMatrix                    *Ccsr;

    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  } else if (c->nz) {
    /* nothing has to be computed for an empty C, hence the test on c->nz */
    Mat                          A = product->A;
    Mat                          B = product->B;
    Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
    Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
    CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
    MatProductType               ptype;
    cusparseStatus_t             stat;
    cudaError_t                  cerr;

    /* operands must be CUSPARSE matrices that have not been bound to the CPU */
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
    if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
    if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
    if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");

    /* all three matrices must use the CSR storage format */
    Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

    /* a transposed symmetric operand is the operand itself */
    ptype = product->type;
    if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
    if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;

    /* select the (possibly explicitly transposed) GPU structures entering the product */
    if (ptype == MATPRODUCT_AB) {
      Amat = Acusp->mat;
      Bmat = Bcusp->mat;
    } else if (ptype == MATPRODUCT_AtB) {
      Amat = Acusp->matTranspose;
      Bmat = Bcusp->mat;
    } else if (ptype == MATPRODUCT_ABt) {
      Amat = Acusp->mat;
      Bmat = Bcusp->matTranspose;
    } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
    Cmat = Ccusp->mat;
    if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
    if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);

    Acsr = (CsrMatrix*)Amat->mat;
    /* B may be in compressed row storage: in that case the context carries the CSR struct to be used */
    if (mmdata->Bcsr) Bcsr = mmdata->Bcsr;
    else              Bcsr = (CsrMatrix*)Bmat->mat;
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
    if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    {
      /* the descriptor of B follows the same choice made for Bcsr above */
      cusparseSpMatDescr_t BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr;

      stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                    cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                    mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
      stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    }
#else
    stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                               Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                               Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* assembly bookkeeping: shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0; /* no mallocs took place */
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2321fcdce8c4SStefano Zampini 
2322fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2323fcdce8c4SStefano Zampini {
2324fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2325fcdce8c4SStefano Zampini   Mat                          A,B;
2326fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2327fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2328fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2329fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2330fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2331fcdce8c4SStefano Zampini   PetscBool                    flg;
2332fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2333fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2334fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2335fcdce8c4SStefano Zampini   MatProductType               ptype;
2336fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2337fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2338fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2339fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2340fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2341fcdce8c4SStefano Zampini   size_t                       bufSize2;
2342fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2343fcdce8c4SStefano Zampini #else
2344fcdce8c4SStefano Zampini   int                          cnz;
2345fcdce8c4SStefano Zampini #endif
2346fcdce8c4SStefano Zampini 
2347fcdce8c4SStefano Zampini   PetscFunctionBegin;
2348fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2349fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2350fcdce8c4SStefano Zampini   A    = product->A;
2351fcdce8c4SStefano Zampini   B    = product->B;
2352fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2353fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2354fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2355fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2356fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2357fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2358fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2359fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2360fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2361fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2362fcdce8c4SStefano Zampini 
2363fcdce8c4SStefano Zampini   /* product data */
2364fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2365fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2366fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2367fcdce8c4SStefano Zampini 
2368fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2369fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2370fcdce8c4SStefano Zampini   ptype = product->type;
2371fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2372fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2373fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2374fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2375fcdce8c4SStefano Zampini   switch (ptype) {
2376fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2377fcdce8c4SStefano Zampini     m = A->rmap->n;
2378fcdce8c4SStefano Zampini     n = B->cmap->n;
2379fcdce8c4SStefano Zampini     k = A->cmap->n;
2380fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2381fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2382fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2383fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2384fcdce8c4SStefano Zampini     break;
2385fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2386fcdce8c4SStefano Zampini     m = A->cmap->n;
2387fcdce8c4SStefano Zampini     n = B->cmap->n;
2388fcdce8c4SStefano Zampini     k = A->rmap->n;
2389fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2390fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2391fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2392fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2393fcdce8c4SStefano Zampini     break;
2394fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2395fcdce8c4SStefano Zampini     m = A->rmap->n;
2396fcdce8c4SStefano Zampini     n = B->rmap->n;
2397fcdce8c4SStefano Zampini     k = A->cmap->n;
2398fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2399fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2400fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2401fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2402fcdce8c4SStefano Zampini     break;
2403fcdce8c4SStefano Zampini   default:
2404fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2405fcdce8c4SStefano Zampini   }
2406fcdce8c4SStefano Zampini 
2407fcdce8c4SStefano Zampini   /* create cusparse matrix */
2408fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2409fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2410fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2411fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2412fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2413fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2414fcdce8c4SStefano Zampini 
2415fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2416fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2417fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2418fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2419fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2420fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2421fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2422fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2423fcdce8c4SStefano Zampini   } else {
2424fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2425fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2426fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2427fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2428fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2429fcdce8c4SStefano Zampini   }
2430fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2431fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2432fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2433fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2434fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2435fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2436fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2437fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2438fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2439fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2440fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2441fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2442fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2443fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2444fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2445fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2446fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2447fcdce8c4SStefano Zampini     c->nz = 0;
2448fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2449fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2450fcdce8c4SStefano Zampini     goto finalizesym;
2451fcdce8c4SStefano Zampini   }
2452fcdce8c4SStefano Zampini 
2453fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2454fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2455fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2456fcdce8c4SStefano Zampini   if (!biscompressed) {
2457fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2458fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2459fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2460fcdce8c4SStefano Zampini #endif
2461fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2462fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2463fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2464fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2465fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2466fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2467fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2468fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2469fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2470fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2471fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2472fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2473fcdce8c4SStefano Zampini     }
2474fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2475fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2476fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2477fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2478fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2479fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2480fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2481fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2482fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2483fcdce8c4SStefano Zampini     }
2484fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2485fcdce8c4SStefano Zampini #endif
2486fcdce8c4SStefano Zampini   }
2487fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2488fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2489fcdce8c4SStefano Zampini   /* precompute flops count */
2490fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2491fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2492fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2493fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2494fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2495fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2496fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2497fcdce8c4SStefano Zampini       }
2498fcdce8c4SStefano Zampini     }
2499fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2500fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2501fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2502fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2503fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2504fcdce8c4SStefano Zampini     }
2505fcdce8c4SStefano Zampini   } else { /* TODO */
2506fcdce8c4SStefano Zampini     flops = 0.;
2507fcdce8c4SStefano Zampini   }
2508fcdce8c4SStefano Zampini 
2509fcdce8c4SStefano Zampini   mmdata->flops = flops;
2510fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2511fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2512fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2513fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2514fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2515fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2516fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2517fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2518fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2519fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2520fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2521fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2522fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2523bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2524fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2525fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2526fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2527fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2528fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2529fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2530fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2531fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2532fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2533fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2534fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2535fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2536fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2537fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2538fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2539bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2540fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2541fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2542fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2543fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2544fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2545fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2546fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2547fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
254800702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2549fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2550fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2551fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2552fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2553fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2554fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2555fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2556fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2557fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2558fcdce8c4SStefano Zampini #else
2559fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2560fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2561fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2562fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2563fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2564fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2565fcdce8c4SStefano Zampini   c->nz = cnz;
2566fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2567fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2568fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2569fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2570fcdce8c4SStefano Zampini 
2571fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2572fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2573fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2574fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2575fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2576fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2577fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2578fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2579fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2580fcdce8c4SStefano Zampini #endif
2581fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2582fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2583fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2584fcdce8c4SStefano Zampini finalizesym:
2585fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2586fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2587fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2588fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2589fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2590fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2591fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2592fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2593fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2594fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2595fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2596fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2597fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2598fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2599fcdce8c4SStefano Zampini   } else {
2600fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2601fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2602fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2603fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2604fcdce8c4SStefano Zampini   }
2605fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2606fcdce8c4SStefano Zampini     PetscInt r = 0;
2607fcdce8c4SStefano Zampini     c->i[0] = 0;
2608fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2609fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2610fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2611fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2612fcdce8c4SStefano Zampini     }
2613fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2614fcdce8c4SStefano Zampini   }
2615fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2616fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2617fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2618fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2619fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2620fcdce8c4SStefano Zampini   c->rmax = 0;
2621fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2622fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2623fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2624fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2625fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2626fcdce8c4SStefano Zampini   }
2627fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2628fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2629fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2630fcdce8c4SStefano Zampini 
2631fcdce8c4SStefano Zampini   C->nonzerostate++;
2632fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2633fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2634fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2635fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2636fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2637fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2638fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2639abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2640fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2641fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2642fcdce8c4SStefano Zampini   }
2643fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2644fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2645fcdce8c4SStefano Zampini }
2646fcdce8c4SStefano Zampini 
/* Declared here for the dense-B fallback used below; the definition is not in this part of the file */
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic product routine stored in mat->ops->productsymbolic
  for a product whose description is attached to mat (mat->product). Handles sparse or dense B.

  Selection rules, in order:
  - B has base type MATSEQDENSE:
      AB, AtB, ABt, PtAP, RARt -> MatProductSetFromOptions_SeqAIJ_SeqDense() decides if A is bound to the CPU,
                                  otherwise MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA
      ABC                      -> MatProductSymbolic_ABC_Basic
      any other type           -> nothing is set
  - B is MATSEQAIJCUSPARSE with neither A nor B bound to the CPU and, for ABC only, C is an unbound MATSEQAIJCUSPARSE:
      AB, AtB, ABt             -> MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE
      PtAP, RARt, ABC          -> MatProductSymbolic_ABC_Basic
      any other type           -> nothing is set
  - otherwise MatProductSetFromOptions_SeqAIJ() decides
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  PetscErrorCode ierr;
  Mat_Product    *product = mat->product;
  PetscBool      Bdense = PETSC_FALSE,Bongpu = PETSC_FALSE,Congpu = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B is only looked at as a CUSPARSE matrix when both A and B are free to run on the GPU */
  if (!(product->A->boundtocpu || product->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bongpu);CHKERRQ(ierr);
  }
  /* C is inspected for ABC only; for every other product type the C flag keeps its initial PETSC_TRUE */
  if (product->type == MATPRODUCT_ABC) {
    Congpu = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Congpu);CHKERRQ(ierr);
    }
  }

  if (Bdense) {
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB ||
               product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
               product->type == MATPRODUCT_RARt) {
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
    }
  } else if (!Bongpu || !Congpu) { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  } else {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  }
  PetscFunctionReturn(0);
}
2707ccdfe979SStefano Zampini 
/* yy = A xx: forwards to the shared mult-add kernel with no vector to add and op(A) = A */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2716e6e9a74fSStefano Zampini 
/* zz = A xx + yy: forwards to the shared mult-add kernel with op(A) = A */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2725e6e9a74fSStefano Zampini 
/* yy = A^H xx: forwards to the shared mult-add kernel with no vector to add, requesting the conjugate transpose */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2734e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: forwards to the shared mult-add kernel, requesting the conjugate transpose */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27439ae82921SPaul Mullowney 
/* yy = A^T xx: forwards to the shared mult-add kernel with no vector to add, requesting the plain transpose */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2752ca45077fSPaul Mullowney 
/*
  ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for 0 <= i < n

  Input Parameters:
+ n   - number of entries to scatter
. idx - array of n target positions in y
- x   - array of n values to add

  Output Parameter:
. y   - array updated in place at the positions listed in idx

  Notes:
  Each thread handles at most one entry, i = blockIdx.x*blockDim.x + threadIdx.x, and threads with i >= n do
  nothing; only the x dimension of the launch configuration is used.

  The update is a plain (non-atomic) read-modify-write, so the entries of idx are presumably expected to be
  distinct -- TODO confirm against the launch sites.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Form the index in PetscInt rather than int: blockIdx.x*blockDim.x is a 32-bit unsigned product, and storing it
     in an int could wrap to a negative value that passes the bound check below when PetscInt is 64-bit and n is large */
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2758a0e72f99SJunchao Zhang 
2759afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2760e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27619ae82921SPaul Mullowney {
27629ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2763aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27649ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2765e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2766b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
276757d48284SJunchao Zhang   cudaError_t                  cerr;
2768aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2769e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2770e6e9a74fSStefano Zampini   PetscBool                    compressed;
2771afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2772afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2773afb2bd1cSJunchao Zhang #endif
27746e111a19SKarl Rupp 
27759ae82921SPaul Mullowney   PetscFunctionBegin;
2776e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2777e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2778afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2779d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2780e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2781e6e9a74fSStefano Zampini   }
278234d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
278334d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2784e6e9a74fSStefano Zampini   if (!trans) {
27859ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2786c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2787e6e9a74fSStefano Zampini   } else {
2788e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2789e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2790e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2791e6e9a74fSStefano Zampini     } else {
2792afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2793e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2794e6e9a74fSStefano Zampini     }
2795e6e9a74fSStefano Zampini   }
2796e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2797e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2798213423ffSJunchao Zhang 
2799e6e9a74fSStefano Zampini   try {
2800e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2801213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2802213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2803afb2bd1cSJunchao Zhang 
280485ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2805e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2806afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2807afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2808afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2809afb2bd1cSJunchao Zhang       */
2810e6e9a74fSStefano Zampini       xptr = xarray;
2811afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2812213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2813afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2814afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2815afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2816afb2bd1cSJunchao Zhang        */
2817afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2818afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2819afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2820afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2821afb2bd1cSJunchao Zhang       }
2822afb2bd1cSJunchao Zhang      #endif
2823e6e9a74fSStefano Zampini     } else {
2824afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2825afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2826afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2827afb2bd1cSJunchao Zhang        */
2828afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2829e6e9a74fSStefano Zampini       dptr = zarray;
2830e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2831afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2832e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2833a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2834e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2835e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2836e6e9a74fSStefano Zampini       }
2837afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2838afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2839afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2840afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2841afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2842afb2bd1cSJunchao Zhang       }
2843afb2bd1cSJunchao Zhang      #endif
2844e6e9a74fSStefano Zampini     }
28459ae82921SPaul Mullowney 
2846afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2847aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2848afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2849afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2850afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2851afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2852afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2853afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2854afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2855afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2856afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2857afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2858afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2859afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2860afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2861afb2bd1cSJunchao Zhang 
2862afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2863afb2bd1cSJunchao Zhang       } else {
2864afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2865afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2866afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2867afb2bd1cSJunchao Zhang       }
2868afb2bd1cSJunchao Zhang 
2869afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2870afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2871afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2872afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2873afb2bd1cSJunchao Zhang                                beta,
2874afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2875afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2876afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2877afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2878afb2bd1cSJunchao Zhang      #else
28797656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2880e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2881a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2882afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2883aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2884e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
288557d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2886afb2bd1cSJunchao Zhang      #endif
2887aa372e3fSPaul Mullowney     } else {
2888213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2889afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2890afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2891afb2bd1cSJunchao Zhang        #else
2892301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2893e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2894afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2895e6e9a74fSStefano Zampini                                  xptr, beta,
289657d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2897afb2bd1cSJunchao Zhang        #endif
2898a65300a6SPaul Mullowney       }
2899aa372e3fSPaul Mullowney     }
290005035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2901958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2902aa372e3fSPaul Mullowney 
2903e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2904213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2905213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2906213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2907e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2908213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
29097656d835SStefano Zampini         }
2910213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2911c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
29127656d835SStefano Zampini       }
29137656d835SStefano Zampini 
2914213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2915213423ffSJunchao Zhang       if (compressed) {
2916e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2917a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2918a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2919a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
2920a0e72f99SJunchao Zhang          */
2921a0e72f99SJunchao Zhang        #if 0
2922a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2923a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2924a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2925e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2926c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
2927a0e72f99SJunchao Zhang        #else
2928a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
2929a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2930a0e72f99SJunchao Zhang        #endif
293105035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2932958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2933e6e9a74fSStefano Zampini       }
2934e6e9a74fSStefano Zampini     } else {
2935e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2936e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2937e6e9a74fSStefano Zampini       }
2938e6e9a74fSStefano Zampini     }
2939e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2940213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2941213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29429ae82921SPaul Mullowney   } catch(char *ex) {
29439ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29449ae82921SPaul Mullowney   }
2945e6e9a74fSStefano Zampini   if (yy) {
2946958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2947e6e9a74fSStefano Zampini   } else {
2948e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2949e6e9a74fSStefano Zampini   }
29509ae82921SPaul Mullowney   PetscFunctionReturn(0);
29519ae82921SPaul Mullowney }
29529ae82921SPaul Mullowney 
/*
  MatMultTransposeAdd_SeqAIJCUSPARSE - Thin wrapper that forwards all four arguments to
  MatMultAddKernel_SeqAIJCUSPARSE() with the two trailing flags fixed to PETSC_TRUE and PETSC_FALSE.

  NOTE(review): the flags presumably select "transpose" and "not Hermitian", i.e. zz = A^T*xx + yy;
  confirm against the kernel's parameter list, which is not visible from here.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
  CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2961ca45077fSPaul Mullowney 
/*
  MatAssemblyEnd_SeqAIJCUSPARSE - Finishes assembly through MatAssemblyEnd_SeqAIJ() and, when a
  device-side matrix structure (deviceMat) is attached to an unfactored matrix, marks the GPU copy
  as the valid one.

  The offload mask is only touched for a final (non-flush) assembly of a matrix that is not bound
  to the CPU.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             perr;
  PetscSplitCSRDataStructure *deviceMat = NULL;

  PetscFunctionBegin;
  /* sample the device pointer before the host-side assembly runs; factored matrices keep a different struct in spptr */
  if (A->factortype == MAT_FACTOR_NONE) deviceMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;

  /* this does very little if assembled on GPU - call it? */
  perr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(perr);

  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && deviceMat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscFunctionReturn(0);
}
29789ae82921SPaul Mullowney 
29799ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2980e057df02SPaul Mullowney /*@
29819ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2984e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2985e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2986e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
29879ae82921SPaul Mullowney 
2988d083f849SBarry Smith    Collective
29899ae82921SPaul Mullowney 
29909ae82921SPaul Mullowney    Input Parameters:
29919ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
29929ae82921SPaul Mullowney .  m - number of rows
29939ae82921SPaul Mullowney .  n - number of columns
29949ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
29959ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
29960298fd71SBarry Smith          (possibly different for each row) or NULL
29979ae82921SPaul Mullowney 
29989ae82921SPaul Mullowney    Output Parameter:
29999ae82921SPaul Mullowney .  A - the matrix
30009ae82921SPaul Mullowney 
30019ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
30039ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
30049ae82921SPaul Mullowney 
30059ae82921SPaul Mullowney    Notes:
30069ae82921SPaul Mullowney    If nnz is given then nz is ignored
30079ae82921SPaul Mullowney 
30089ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
30099ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
30109ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
30119ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
30129ae82921SPaul Mullowney 
30139ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
30140298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
30159ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
30169ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
30179ae82921SPaul Mullowney 
30189ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
30199ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
30209ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
30219ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
30229ae82921SPaul Mullowney 
30239ae82921SPaul Mullowney    Level: intermediate
30249ae82921SPaul Mullowney 
3025e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
30269ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  /* create the object; the same values m,n are passed as both the local and the global sizes */
  perr = MatCreate(comm,A);
  CHKERRQ(perr);
  perr = MatSetSizes(*A,m,n,m,n);
  CHKERRQ(perr);

  /* select the CUSPARSE type, then preallocate the host-side AIJ storage */
  perr = MatSetType(*A,MATSEQAIJCUSPARSE);
  CHKERRQ(perr);
  perr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz); /* the callee takes a non-const pointer, hence the cast */
  CHKERRQ(perr);
  PetscFunctionReturn(0);
}
30389ae82921SPaul Mullowney 
/*
  MatDestroy_SeqAIJCUSPARSE - Releases the GPU-side data attached to A, removes the functions
  composed on the object, and finally destroys the underlying SeqAIJ matrix.

  For an unfactored matrix the deviceMat pointer is detached from the Mat_SeqAIJCUSPARSE struct
  before that struct is destroyed, and the device structure it points to is freed here. For a
  factored matrix spptr is destroyed as a Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in the order they are cleared */
  static const char *const composedNames[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C"
  };
  const PetscInt             nComposed = (PetscInt)(sizeof(composedNames)/sizeof(composedNames[0]));
  PetscErrorCode             perr;
  PetscSplitCSRDataStructure *deviceMat = NULL;
  PetscInt                   k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    perr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(perr);
  } else {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* take the device struct out of spptr before spptr is destroyed; it is freed below */
    deviceMat       = cusp->deviceMat;
    cusp->deviceMat = NULL;
    perr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(perr);
  }

  if (deviceMat) {
    Mat_SeqAIJ                 *aij = (Mat_SeqAIJ*)A->data;
    PetscSplitCSRDataStructure hostCopy;
    cudaError_t                cuerr;

    perr = PetscInfo(A,"Have device matrix\n");CHKERRQ(perr);
    /* bring the struct to the host so that the device pointer stored inside it can be read */
    cuerr = cudaMemcpy(&hostCopy,deviceMat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(cuerr);
    if (aij->compressedrow.use) {
      cuerr = cudaFree(hostCopy.diag.i);CHKERRCUDA(cuerr);
    }
    cuerr = cudaFree(deviceMat);CHKERRCUDA(cuerr);
  }

  for (k=0; k<nComposed; k++) {
    perr = PetscObjectComposeFunction((PetscObject)A,composedNames[k],NULL);CHKERRQ(perr);
  }
  perr = MatDestroy_SeqAIJ(A);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
30749ae82921SPaul Mullowney 
3075ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
307695639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A with MatDuplicate_SeqAIJ() and then converts the
  result in place (MAT_INPLACE_MATRIX) to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  /* step 1: host-side duplicate */
  perr = MatDuplicate_SeqAIJ(A,cpvalues,B);
  CHKERRQ(perr);
  /* step 2: convert the duplicate in place to the CUSPARSE type */
  perr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
  CHKERRQ(perr);
  PetscFunctionReturn(0);
}
30869ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y, using the GPU where possible.

  Input Parameters:
    Y   - the matrix that is updated
    a   - the scalar multiplier
    X   - the matrix that is added
    str - relation between the nonzero patterns of X and Y

  Dispatch:
    - X and Y use different axpy implementations: invalidate Y's transpose and use MatAXPY_SeqAIJ().
    - SAME_NONZERO_PATTERN: one cuBLAS axpy over the value arrays. This is also chosen when the
      nonzero counts match, neither matrix uses compressed rows, and the row offsets and column
      indices compare equal on the device.
    - SUBSET_NONZERO_PATTERN: cusparse csr spgeam, writing the result into Y's existing arrays.
    - anything else: invalidate Y's transpose and use MatAXPY_SeqAIJ().
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data;
  Mat_SeqAIJ         *y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  CsrMatrix          *csry,*csrx;
  PetscScalar        *ay;
  const PetscScalar  *ax;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share this implementation: use the host routine */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;

  /* see if we can turn this into a cublas axpy: the patterns are compared on the device, rows first, columns only if the rows match */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    if (thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()) &&
        thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin())) {
      str = SAME_NONZERO_PATTERN;
    }
  }
  /* spgeam is buggy with one column: route such matrices to the host fallback below */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry by entry, so a dense axpy suffices */
    cublasHandle_t blasHandle;
    cublasStatus_t berr;
    PetscBLASInt   inc = 1, nvals = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&blasHandle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&nvals);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(blasHandle,nvals,&a,ax,inc,ay,inc);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*nvals);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SUBSET_NONZERO_PATTERN) {
    /* sparse C = a*X + beta*Y with C aliasing Y's arrays */
    cusparseStatus_t stat;
    PetscScalar      beta = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars a and beta live on the host for this call; device mode is restored afterwards */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 needs an explicit work buffer; it is allocated outside the timed region */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &beta,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &beta,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &beta,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* every other pattern relation goes through the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
318795639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a with one cuBLAS scal call over the
  array obtained from MatSeqAIJCUSPARSEGetArray().

  Input Parameters:
    Y - the matrix to scale
    a - the scale factor
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  PetscBLASInt   inc = 1, nvals = 1;
  cublasHandle_t blasHandle;
  cublasStatus_t berr;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&blasHandle);CHKERRQ(ierr);
  /* the BLAS routine takes a PetscBLASInt length, so the nonzero count is range-checked first */
  ierr = PetscBLASIntCast(aij->nz,&nvals);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(blasHandle,nvals,&a,vals,inc);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
321133c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Zeroes the values of A on the host and, when they exist, on the GPU.

  For an unfactored matrix the device value arrays of both the matrix and its cached transpose
  are filled with zeros if present. The host array is always zeroed. The offload mask becomes
  PETSC_OFFLOAD_BOTH if the device values of the matrix itself were zeroed, otherwise
  PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuZeroed = PETSC_FALSE; /* set only when the device values of A itself were zeroed */
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
        gpuZeroed = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      /* zero the cached transpose as well; this does not influence the offload mask */
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }

  /* host side: zero the value array directly (MatZeroEntries_SeqAIJ() is not called) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);

  A->offloadmask = gpuZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
32433fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches the operation table and the composed functions of A
  between the host (SeqAIJ) and GPU (SeqAIJCUSPARSE) implementations.

  Input Parameters:
    A   - the matrix; factored matrices are left untouched
    flg - PETSC_TRUE installs the SeqAIJ routines (after copying the data back from the GPU),
          PETSC_FALSE installs the SeqAIJCUSPARSE routines

  Afterwards both A->boundtocpu and the inode flag of the SeqAIJ data are set to flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);

  if (!flg) {
    /* GPU execution: install the CUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* CPU execution: copy the data back from the GPU before the host routines take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the GPU-only composed functions; only the array getter is replaced by its host version */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }

  A->boundtocpu  = flg;
  aij->inode.use = flg; /* the inode flag follows the binding: on when bound to the CPU, off otherwise */
  PetscFunctionReturn(0);
}
3294a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Converts a SeqAIJ matrix to the SeqAIJCUSPARSE type.

  Input Parameters:
+ A     - the matrix to convert
. mtype - the requested type (not referenced in the body)
- reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
          already existing *newmat, any other value works directly on the matrix in *newmat

  Output Parameter:
. newmat - the converted matrix
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  /* the very first use of CUSPARSE may happen through MatConvert */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to copy, *newmat is used as is */
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: spptr holds the triangular factors container */
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr = PetscNew(&factors);CHKERRQ(ierr);
      stat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(factors->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = factors;
    } else {
      /* regular matrix: spptr holds the Mat_SeqAIJCUSPARSE container */
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      cusp->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = cusp;
      cusp->deviceMat = NULL;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;

  /* binding with PETSC_FALSE installs the remaining GPU operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33499ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for the SeqAIJCUSPARSE matrix type.

  Builds B as a plain SeqAIJ matrix, converts it in place to SeqAIJCUSPARSE and then
  processes the CUSPARSE-specific options for it.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* start from the host implementation ... */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* ... and upgrade the very same object to the GPU type */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);

  /* PetscOptionsObject is provided by the PetscObjectOptionsBegin()/PetscOptionsEnd() pair */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
336202fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
33797f756511SDominic Meiser 
338042c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
338142c9c57cSBarry Smith 
33820f39cd5aSBarry Smith 
33833ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
338442c9c57cSBarry Smith {
338542c9c57cSBarry Smith   PetscErrorCode ierr;
338642c9c57cSBarry Smith 
338742c9c57cSBarry Smith   PetscFunctionBegin;
33883ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33893ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33903ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33913ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
339242c9c57cSBarry Smith   PetscFunctionReturn(0);
339342c9c57cSBarry Smith }
339429b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases a Mat_SeqAIJCUSPARSE container and everything it owns.

  Input/Output Parameter:
. cusparsestruct - address of the container pointer; nothing is done when it holds NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  cusp = *cusparsestruct;
  if (!cusp) PetscFunctionReturn(0);

  /* the matrix and its transpose share the same storage format */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* auxiliary arrays */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  /* free through the caller's pointer */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34147f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its values, column indices and
  row offsets arrays, then sets the caller's pointer to NULL.

  Input/Output Parameter:
. mat - address of the CsrMatrix pointer; nothing is done when it holds NULL
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr;

  PetscFunctionBegin;
  csr = *mat;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
34277f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload releasing one triangular factor structure.

  Input/Output Parameter:
. trifactor - address of the factor pointer; nothing is done when it holds NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  tf = *trifactor;
  if (!tf) PetscFunctionReturn(0);

  /* cuSPARSE descriptor and triangular-solve analysis data */
  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }

  /* CSR storage of the factor */
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);

  /* buffers: solveBuffer and csr2cscBuffer go through cudaFree, AA_h through cudaFreeHost */
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
#endif
  /* free through the caller's pointer */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34477f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload releasing a mat-vec structure.

  Input Parameters:
+ matstruct - address of the structure pointer; nothing is done when it holds NULL
- format    - storage format of (*matstruct)->mat, used to select how that member is destroyed

  Notes:
  On return *matstruct is NULL. With CUDA 11 or later the ELL and HYB formats raise
  PETSC_ERR_SUP, since the code path destroying them is compiled out.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of silently dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* scalar constants released with cudaFree */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* release the buffer and dense-vector descriptors of every initialized cuSpMV slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
34917f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases the contents of a triangular factors container
  while keeping the container itself.

  Input Parameter:
. trifactors - address of the container pointer; nothing is done when it holds NULL

  Notes:
  The four factor structures are destroyed; the permutation index arrays and the work vector
  are deleted and their pointers set to NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  fs = *trifactors;
  if (!fs) PetscFunctionReturn(0);

  /* lower/upper factors and their transposes */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);

  /* row/column permutations and scratch vector */
  delete fs->rpermIndices;
  fs->rpermIndices = NULL;
  delete fs->cpermIndices;
  fs->cpermIndices = NULL;
  delete fs->workVector;
  fs->workVector = NULL;
  PetscFunctionReturn(0);
}
3511ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - Releases a triangular factors container: its contents
  (via MatSeqAIJCUSPARSETriFactors_Reset()), its cuSPARSE handle and the container itself.

  Input/Output Parameter:
. trifactors - address of the container pointer; nothing is done when it holds NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fetch first, test second: avoids an assignment inside the condition */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
35287e8381f9SStefano Zampini 
/* Strict lexicographic "less than" on (i,j) pairs: first component first, then the second.
   Elements are read with the free function thrust::get<>() rather than the member get<>(),
   which is not provided by every tuple implementation backing thrust::tuple. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
35397e8381f9SStefano Zampini 
/* Equality on (i,j) pairs: true only when both components match.
   Elements are read with the free function thrust::get<>() rather than the member get<>(),
   which is not provided by every tuple implementation backing thrust::tuple. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) != thrust::get<0>(t2) || thrust::get<1>(t1) != thrust::get<1>(t2)) return false;
    return true;
  }
};
35497e8381f9SStefano Zampini 
/* Binary functor yielding 1 when its two arguments differ and 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
35587e8381f9SStefano Zampini 
/* Binary functor yielding the logical OR of its arguments as 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
35677e8381f9SStefano Zampini 
35687e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - Sets the matrix values on the GPU from an array given in
  the COO ordering recorded in cusp->cooPerm (and cusp->cooPerm_a when entries repeat).

  Input Parameters:
+ A     - the matrix
. v     - values in COO order, host or device memory (checked with isCudaMem()); when NULL the
          stored values are zeroed for INSERT_VALUES and left untouched otherwise
- imode - ADD_VALUES adds to the stored values, any other mode overwrites them

  Notes:
  Without a recorded permutation the routine only assembles the matrix. Repeated COO entries
  are summed before being added or inserted.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *staged = NULL; /* device copy of v, used only when v is host memory */
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (v) {
    thrust::device_ptr<const PetscScalar> d_v;
    const PetscInt                        n = cusp->cooPerm->size();

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* stage the host values on the device */
      staged = new THRUSTARRAY(n);
      staged->assign(v,v+n);
      d_v = staged->data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* input values visited through the recorded COO permutation */
    auto pfirst = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
    if (cusp->cooPerm_a) {
      /* repeated entries in COO -> reduce them by key */
      if (imode == ADD_VALUES) {
        /* ADD VALUES means add to existing ones: reduce into scratch storage, then accumulate */
        THRUSTARRAY *sums = new THRUSTARRAY(csr->values->size());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),pfirst,thrust::make_discard_iterator(),sums->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(sums->begin(),sums->end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
        delete sums;
      } else {
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),pfirst,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else {
      /* no repeated entries: pair each permuted input value with its stored value */
      auto plast = thrust::make_permutation_iterator(d_v,cusp->cooPerm->end());
      auto zfirst = thrust::make_zip_iterator(thrust::make_tuple(pfirst,csr->values->begin()));
      auto zlast  = thrust::make_zip_iterator(thrust::make_tuple(plast,csr->values->end()));
      if (imode == ADD_VALUES) {
        thrust::for_each(zfirst,zlast,VecCUDAPlusEquals());
      } else {
        thrust::for_each(zfirst,zlast,VecCUDAEquals());
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  } else if (imode == INSERT_VALUES) {
    thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  }

  delete staged;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,aij->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",aij->rmax);CHKERRQ(ierr);
  aij->reallocs       = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled        = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
36477e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - Flags the cached GPU transpose of A as out of date.

  Input Parameters:
+ A       - the matrix, must be of type MATSEQAIJCUSPARSE
- destroy - when true, the transpose structure and the csr2csc_i array are also released

  Notes:
  Nothing is done when A carries no CUSPARSE container.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    cusp->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3664a49f1ed0SStefano Zampini 
36657e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - Builds the CSR nonzero structure of A from a list of
  COO (i,j) index pairs and records the permutation used later by MatSetValuesCOO.

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices, copied to the device with THRUSTINTARRAY::assign()
- coo_j - column indices, copied to the device with THRUSTINTARRAY::assign()

  Notes:
  The pairs are sorted lexicographically on the GPU and repeated pairs are collapsed.
  cusp->cooPerm holds the sorting permutation; cusp->cooPerm_a is NULL when all pairs are
  unique, otherwise it maps every sorted entry to the 0-based index of its unique pair.
  The host CSR arrays of A are rebuilt with zero values and copied to the GPU. On return the
  matrix is marked as not assembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* same guard as MatSetValuesCOO_SeqAIJCUSPARSE: fail cleanly instead of dereferencing NULL */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* drop the permutation arrays when the number of COO entries changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) keys, carrying along the identity permutation */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted rows/columns: thrust::unique below compacts d_i and d_j in place */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where the row or the column changes with respect to the previous sorted
         entry, then scan the flags to number the unique (i,j) pairs */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* for every row r, count the unique entries whose row index is <= r: these are the CSR
       row end offsets, i.e. a->i[r+1] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), thrust::get<0>(nekey.get_iterator_tuple()),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* the first a->nz entries of d_j are the column indices of the unique pairs */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3765ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayRead - Hands out a read-only raw pointer to the values array of the
  CsrMatrix stored in the Mat_SeqAIJCUSPARSE structure of A.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the CSR values array

  Notes:
  The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is extracted.
  PETSC_ERR_COR is raised if the mult structure or the values array is missing.
  Neither A->offloadmask nor the object state is modified by this routine.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only now, after A has been validated, so that an invalid
     A is reported by the checks above instead of being dereferenced */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3784ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - Counterpart (by name) of MatSeqAIJCUSPARSEGetArrayRead().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the caller's pointer; it is reset to NULL

  Notes:
  Only the caller's pointer is cleared; nothing in A is modified.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate the caller's handle */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3794ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - Hands out a read-write raw pointer to the values array of the
  CsrMatrix stored in the Mat_SeqAIJCUSPARSE structure of A.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the CSR values array

  Notes:
  The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is extracted.
  PETSC_ERR_COR is raised if the mult structure or the values array is missing.
  On success A->offloadmask is set to PETSC_OFFLOAD_GPU and
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only now, after A has been validated, so that an invalid
     A is reported by the checks above instead of being dereferenced */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may write through the pointer: flag the GPU side as the one in use */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3815039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Counterpart (by name) of MatSeqAIJCUSPARSEGetArray().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the caller's pointer; it is reset to NULL

  Notes:
  The object state of A is increased before the pointer is cleared.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the object state first; the handle is cleared only if that succeeds */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3828039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Hands out a writable raw pointer to the values array of the
  CsrMatrix stored in the Mat_SeqAIJCUSPARSE structure of A.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the CSR values array

  Notes:
  The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.
  MatSeqAIJCUSPARSECopyToGPU() is not called here; if the mult structure or the
  values array does not exist yet, PETSC_ERR_COR is raised.
  On success A->offloadmask is set to PETSC_OFFLOAD_GPU and
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only now, after A has been validated, so that an invalid
     A is reported by the checks above instead of being dereferenced */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller will write through the pointer: flag the GPU side as the one in use */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3848ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Counterpart (by name) of MatSeqAIJCUSPARSEGetArrayWrite().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the caller's pointer; it is reset to NULL

  Notes:
  The object state of A is increased before the pointer is cleared.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the object state first; the handle is cleared only if that succeeds */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3861ed502f03SStefano Zampini 
/*
  IJCompare4 - "less than" comparator for 4-entry tuples.

  Tuples are ordered lexicographically on their first two entries only; the third
  and fourth entries never take part in the comparison. In MatSeqAIJCUSPARSEMergeMats()
  the tuples are built as (COO row index, column index, value, origin flag).

  Elements are read with the free function thrust::get<N>() rather than the member
  form, and the call operator is const since the functor carries no state.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    const int i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    /* first entries differ: they alone decide the order */
    if (i1 != i2) return i1 < i2;
    /* first entries tie: fall back to the second entry */
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
3872ed502f03SStefano Zampini 
/*
  Shift - unary functor returning its integer argument plus a fixed offset.

  The offset is fixed at construction. The call operator does not modify the
  functor, so it is declared const and can be invoked on const instances.
*/
struct Shift
{
  int _shift; /* offset added to every argument */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3884ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3886ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3887ed502f03SStefano Zampini {
3888ed502f03SStefano Zampini   PetscErrorCode               ierr;
3889ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3890ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3891ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3892ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3893ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
3894ed502f03SStefano Zampini   cusparseStatus_t             stat;
3895ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
3896ed502f03SStefano Zampini   cudaError_t                  cerr;
3897ed502f03SStefano Zampini 
3898ed502f03SStefano Zampini   PetscFunctionBegin;
3899ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3900ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3901ed502f03SStefano Zampini   PetscValidPointer(C,4);
3902ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3903ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3904ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3905ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3906ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3907ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3908ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
3909ed502f03SStefano Zampini     m     = A->rmap->n;
3910ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
3911ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3912ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3913ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3914ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
3915ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3916ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3917ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
3918ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
3919ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
3920ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
3921ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
3922ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
3923ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
3924ed502f03SStefano Zampini     Ccusp->nrows    = m;
3925ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
3926ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
3927ed502f03SStefano Zampini     Ccsr->num_rows  = m;
3928ed502f03SStefano Zampini     Ccsr->num_cols  = n;
3929ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3930ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3931ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3932ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3933ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3934ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3935ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3936ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3937ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3938ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3939ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3940ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
3941ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
3942ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3943ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3944ed502f03SStefano Zampini 
3945ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
3946ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3947ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
3948ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
3949ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
3950ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3951ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3952ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
3953ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
3954ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3955ed502f03SStefano Zampini     if (c->nz) {
39562ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
39572ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
39582ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
39592ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
39602ed87e7eSStefano Zampini 
3961ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
3962ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
3963ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3964ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3965ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3966ed502f03SStefano Zampini         }
39672ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
39682ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
3969ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
3970ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
3971ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3972ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3973ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3974ed502f03SStefano Zampini         }
39752ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
39762ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
3977ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
39782ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
39792ed87e7eSStefano Zampini                               Aroff->data().get(),
39802ed87e7eSStefano Zampini                               Annz,
39812ed87e7eSStefano Zampini                               m,
39822ed87e7eSStefano Zampini                               Acoo->data().get(),
39832ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3984ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
39852ed87e7eSStefano Zampini                               Broff->data().get(),
3986ed502f03SStefano Zampini                               Bnnz,
3987ed502f03SStefano Zampini                               m,
39882ed87e7eSStefano Zampini                               Bcoo->data().get(),
3989ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
39902ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
39912ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
39922ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
39938909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3994ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3995ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
39968909a122SStefano Zampini #else
39978909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
39988909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
39998909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
40008909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
40018909a122SStefano Zampini #endif
40022ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
40032ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
40042ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
40052ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
40062ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
40072ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4008ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4009ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4010ed502f03SStefano Zampini       thrust::advance(p2,Annz);
40112ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
40128909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
40138909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
40148909a122SStefano Zampini #endif
40152ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
40162ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
40172ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
40182ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
40192ed87e7eSStefano Zampini #else
40202ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
40212ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
40222ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
40232ed87e7eSStefano Zampini #endif
4024ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
40252ed87e7eSStefano Zampini                               Ccoo->data().get(),
4026ed502f03SStefano Zampini                               c->nz,
4027ed502f03SStefano Zampini                               m,
4028ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4029ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4030ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4031ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
40322ed87e7eSStefano Zampini       delete wPerm;
40332ed87e7eSStefano Zampini       delete Acoo;
40342ed87e7eSStefano Zampini       delete Bcoo;
40352ed87e7eSStefano Zampini       delete Ccoo;
4036ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4037ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4038ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4039ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4040ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4041ed502f03SStefano Zampini #endif
4042ed502f03SStefano Zampini       if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
4043ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4044ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4045ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4046ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4047ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4048ed502f03SStefano Zampini 
4049ed502f03SStefano Zampini         Ccusp->transgen = PETSC_TRUE;
4050a49f1ed0SStefano Zampini         Ccusp->transupdated = PETSC_TRUE;
4051a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4052ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4053ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4054ed502f03SStefano Zampini         CcsrT->num_rows = n;
4055ed502f03SStefano Zampini         CcsrT->num_cols = m;
4056ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4057ed502f03SStefano Zampini 
4058ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4059ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4060ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4061ed502f03SStefano Zampini 
4062ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4063ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4064ed502f03SStefano Zampini         if (AT) {
4065ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4066ed502f03SStefano Zampini           thrust::advance(rT,-1);
4067ed502f03SStefano Zampini         }
4068ed502f03SStefano Zampini         if (BT) {
4069ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4070ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4071ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4072ed502f03SStefano Zampini         }
4073ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4074ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4075ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4076ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4077ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4078ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4079ed502f03SStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4080ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4081ed502f03SStefano Zampini 
4082ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4083ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4084ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4085ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4086ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4087ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4088ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4089ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4090ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4091ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4092ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4093ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4094ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4095ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4096ed502f03SStefano Zampini #endif
4097ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4098ed502f03SStefano Zampini       }
4099ed502f03SStefano Zampini     }
4100ed502f03SStefano Zampini 
4101ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4102ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4103ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4104ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4105ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4106ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4107ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4108ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4109ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4110ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4111ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4112ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4113ed502f03SStefano Zampini     } else {
4114ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4115ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4116ed502f03SStefano Zampini     }
4117ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4118ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4119ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4120ed502f03SStefano Zampini     c->maxnz = c->nz;
4121ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4122ed502f03SStefano Zampini     c->rmax = 0;
4123ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4124ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4125ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4126ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4127ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4128ed502f03SStefano Zampini     }
4129ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4130ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4131ed502f03SStefano Zampini     (*C)->nonzerostate++;
4132ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4133ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4134ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4135ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4136ed502f03SStefano Zampini   } else {
4137ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4138ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4139ed502f03SStefano Zampini     if (c->nz) {
4140ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4141ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4142ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4143ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4144ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4145ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4146ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4147ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4148ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4149ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4150ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4151ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4152ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4153ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4154ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4155ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4156ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4157ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4158ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4159ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4160ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4161ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4162ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4163ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4164ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4165ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4166ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4167ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4168ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4169a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4170ed502f03SStefano Zampini       if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
4171ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4172ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4173ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4174ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4175ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4176ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4177ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4178ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4179a49f1ed0SStefano Zampini         Ccusp->transupdated = PETSC_TRUE;
4180ed502f03SStefano Zampini       }
4181ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4182ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4183ed502f03SStefano Zampini     }
4184ed502f03SStefano Zampini   }
4185ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4186ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4187ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4188ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4189ed502f03SStefano Zampini   PetscFunctionReturn(0);
4190ed502f03SStefano Zampini }
4191c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the values array of A into v

  Input Parameters:
+ A   - the matrix; its values array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
        and is handled here as device memory
. n   - number of entries to copy
- idx - positions, within the values array, of the entries to gather (read on the host),
        or NULL to copy the first n entries contiguously

  Output Parameter:
. v   - destination of length n; isCudaMem() decides whether it is treated as
        device or host memory

  Notes:
  When idx is given, the indices are uploaded to a temporary device index array and the
  selected values are gathered on the device with VecCUDAEquals (defined elsewhere;
  presumably it assigns the first tuple element to the second, i.e. v[i] = values[idx[i]]).
  For a host destination the gather goes through a temporary device buffer that is then
  copied back to v.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Device staging buffer, needed only when v is host memory. It is a scoped object
       (empty when v is already on the device) so that it is released on every exit path,
       including the early return taken by CHKERRCUDA() below */
    THRUSTARRAY w(dmem ? (PetscInt)0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair (values[idx[i]], dv[i]) for i in [0,n) and let VecCUDAEquals process each pair */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the host destination */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no indices (or nothing to gather): plain contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so log a GPU -> CPU transfer */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4231