xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d2be01edfa4015c830ecdb394101377247ef7878)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
6833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
726fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
769ae82921SPaul Mullowney 
777f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
82470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
837f756511SDominic Meiser 
8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
8657181aedSStefano Zampini 
877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
897e8381f9SStefano Zampini 
90c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91c215019aSStefano Zampini 
92b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
93b06137fdSPaul Mullowney {
94b06137fdSPaul Mullowney   cusparseStatus_t   stat;
95b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
96b06137fdSPaul Mullowney 
97b06137fdSPaul Mullowney   PetscFunctionBegin;
98d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
99b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10057d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101b06137fdSPaul Mullowney   PetscFunctionReturn(0);
102b06137fdSPaul Mullowney }
103b06137fdSPaul Mullowney 
104b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105b06137fdSPaul Mullowney {
106b06137fdSPaul Mullowney   cusparseStatus_t   stat;
107b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
108b06137fdSPaul Mullowney 
109b06137fdSPaul Mullowney   PetscFunctionBegin;
110d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1116b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11216a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11357d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11416a2e217SAlejandro Lamas Daviña     }
115b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1166b1cf21dSAlejandro Lamas Daviña   }
11757d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118b06137fdSPaul Mullowney   PetscFunctionReturn(0);
119b06137fdSPaul Mullowney }
120b06137fdSPaul Mullowney 
121b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122b06137fdSPaul Mullowney {
123b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1247e8381f9SStefano Zampini   PetscBool          flg;
1257e8381f9SStefano Zampini   PetscErrorCode     ierr;
126ccdfe979SStefano Zampini 
127b06137fdSPaul Mullowney   PetscFunctionBegin;
1287e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1297e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
130ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131b06137fdSPaul Mullowney   PetscFunctionReturn(0);
132b06137fdSPaul Mullowney }
133b06137fdSPaul Mullowney 
134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1359ae82921SPaul Mullowney {
1369ae82921SPaul Mullowney   PetscFunctionBegin;
1379ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1389ae82921SPaul Mullowney   PetscFunctionReturn(0);
1399ae82921SPaul Mullowney }
1409ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1539ae82921SPaul Mullowney 
15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1559ae82921SPaul Mullowney {
1569ae82921SPaul Mullowney   PetscErrorCode ierr;
157bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1589ae82921SPaul Mullowney 
1599ae82921SPaul Mullowney   PetscFunctionBegin;
160bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
161bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1622c7c0729SBarry Smith   (*B)->factortype = ftype;
1632c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1649ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1652205254eSKarl Rupp 
166087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16733d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1689ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1699ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
170087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
171087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
172087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1739ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
174bc3f50f2SPaul Mullowney 
175fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1763ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1779ae82921SPaul Mullowney   PetscFunctionReturn(0);
1789ae82921SPaul Mullowney }
1799ae82921SPaul Mullowney 
180bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
181ca45077fSPaul Mullowney {
182aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1836e111a19SKarl Rupp 
184ca45077fSPaul Mullowney   PetscFunctionBegin;
185ca45077fSPaul Mullowney   switch (op) {
186e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
187aa372e3fSPaul Mullowney     cusparsestruct->format = format;
188ca45077fSPaul Mullowney     break;
189e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
190aa372e3fSPaul Mullowney     cusparsestruct->format = format;
191ca45077fSPaul Mullowney     break;
192ca45077fSPaul Mullowney   default:
19336d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
194ca45077fSPaul Mullowney   }
195ca45077fSPaul Mullowney   PetscFunctionReturn(0);
196ca45077fSPaul Mullowney }
1979ae82921SPaul Mullowney 
198e057df02SPaul Mullowney /*@
199e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
200e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
201aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
202e057df02SPaul Mullowney    Not Collective
203e057df02SPaul Mullowney 
204e057df02SPaul Mullowney    Input Parameters:
2058468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
20636d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2072692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
208e057df02SPaul Mullowney 
209e057df02SPaul Mullowney    Output Parameter:
210e057df02SPaul Mullowney 
211e057df02SPaul Mullowney    Level: intermediate
212e057df02SPaul Mullowney 
2138468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
214e057df02SPaul Mullowney @*/
215e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
216e057df02SPaul Mullowney {
217e057df02SPaul Mullowney   PetscErrorCode ierr;
2186e111a19SKarl Rupp 
219e057df02SPaul Mullowney   PetscFunctionBegin;
220e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
221e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
222e057df02SPaul Mullowney   PetscFunctionReturn(0);
223e057df02SPaul Mullowney }
224e057df02SPaul Mullowney 
225e6e9a74fSStefano Zampini /*@
226e589036eSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
227e6e9a74fSStefano Zampini 
228e6e9a74fSStefano Zampini    Collective on mat
229e6e9a74fSStefano Zampini 
230e6e9a74fSStefano Zampini    Input Parameters:
231e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
232e6e9a74fSStefano Zampini -  transgen - the boolean flag
233e6e9a74fSStefano Zampini 
234e6e9a74fSStefano Zampini    Level: intermediate
235e6e9a74fSStefano Zampini 
236e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
237e6e9a74fSStefano Zampini @*/
238e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
239e6e9a74fSStefano Zampini {
240e6e9a74fSStefano Zampini   PetscErrorCode ierr;
241e6e9a74fSStefano Zampini   PetscBool      flg;
242e6e9a74fSStefano Zampini 
243e6e9a74fSStefano Zampini   PetscFunctionBegin;
244e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
245e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
246e6e9a74fSStefano Zampini   if (flg) {
247e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
24854da937aSStefano Zampini 
249e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
250e6e9a74fSStefano Zampini     cusp->transgen = transgen;
25154da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
25254da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
25354da937aSStefano Zampini     }
254e6e9a74fSStefano Zampini   }
255e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
256e6e9a74fSStefano Zampini }
257e6e9a74fSStefano Zampini 
2584416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2599ae82921SPaul Mullowney {
2609ae82921SPaul Mullowney   PetscErrorCode           ierr;
261e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2629ae82921SPaul Mullowney   PetscBool                flg;
263a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2646e111a19SKarl Rupp 
2659ae82921SPaul Mullowney   PetscFunctionBegin;
266e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2679ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
26854da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
26954da937aSStefano Zampini 
27054da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
271afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
272afb2bd1cSJunchao Zhang 
273e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
274a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
275afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
276afb2bd1cSJunchao Zhang 
2774c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
278a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
279afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
280afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
281afb2bd1cSJunchao Zhang     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
282afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
283afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
284afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
285afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
286afb2bd1cSJunchao Zhang 
287afb2bd1cSJunchao Zhang     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
288afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
289afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
290afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
291afb2bd1cSJunchao Zhang 
292afb2bd1cSJunchao Zhang     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
293afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
294afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
295afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
296afb2bd1cSJunchao Zhang    #endif
2974c87dfd4SPaul Mullowney   }
2980af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2999ae82921SPaul Mullowney   PetscFunctionReturn(0);
3009ae82921SPaul Mullowney }
3019ae82921SPaul Mullowney 
3026fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3039ae82921SPaul Mullowney {
304da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3059ae82921SPaul Mullowney   PetscErrorCode               ierr;
3069ae82921SPaul Mullowney 
3079ae82921SPaul Mullowney   PetscFunctionBegin;
308da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3099ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3109ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3119ae82921SPaul Mullowney   PetscFunctionReturn(0);
3129ae82921SPaul Mullowney }
3139ae82921SPaul Mullowney 
3146fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3159ae82921SPaul Mullowney {
316da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3179ae82921SPaul Mullowney   PetscErrorCode               ierr;
3189ae82921SPaul Mullowney 
3199ae82921SPaul Mullowney   PetscFunctionBegin;
320da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3219ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3229ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3239ae82921SPaul Mullowney   PetscFunctionReturn(0);
3249ae82921SPaul Mullowney }
3259ae82921SPaul Mullowney 
326087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
327087f3262SPaul Mullowney {
328da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
329087f3262SPaul Mullowney   PetscErrorCode               ierr;
330087f3262SPaul Mullowney 
331087f3262SPaul Mullowney   PetscFunctionBegin;
332da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
333087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
334087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
335087f3262SPaul Mullowney   PetscFunctionReturn(0);
336087f3262SPaul Mullowney }
337087f3262SPaul Mullowney 
338087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
339087f3262SPaul Mullowney {
340da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
341087f3262SPaul Mullowney   PetscErrorCode               ierr;
342087f3262SPaul Mullowney 
343087f3262SPaul Mullowney   PetscFunctionBegin;
344da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
345087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
346087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
347087f3262SPaul Mullowney   PetscFunctionReturn(0);
348087f3262SPaul Mullowney }
349087f3262SPaul Mullowney 
350087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3519ae82921SPaul Mullowney {
3529ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3539ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3549ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
355aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3569ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3579ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3589ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3599ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3609ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
361b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
36257d48284SJunchao Zhang   cudaError_t                       cerr;
3639ae82921SPaul Mullowney 
3649ae82921SPaul Mullowney   PetscFunctionBegin;
365cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
366c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3679ae82921SPaul Mullowney     try {
3689ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3699ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
370da79fbbcSStefano Zampini       if (!loTriFactor) {
3712cbc15d9SMark         PetscScalar                       *AALo;
3722cbc15d9SMark 
3732cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3749ae82921SPaul Mullowney 
3759ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
37657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3789ae82921SPaul Mullowney 
3799ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3809ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3819ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3829ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3839ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3849ae82921SPaul Mullowney         v        = aa;
3859ae82921SPaul Mullowney         vi       = aj;
3869ae82921SPaul Mullowney         offset   = 1;
3879ae82921SPaul Mullowney         rowOffset= 1;
3889ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3899ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
390e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3919ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3929ae82921SPaul Mullowney           rowOffset += nz+1;
3939ae82921SPaul Mullowney 
394580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
395580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3969ae82921SPaul Mullowney 
3979ae82921SPaul Mullowney           offset      += nz;
3989ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3999ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4009ae82921SPaul Mullowney           offset      += 1;
4019ae82921SPaul Mullowney 
4029ae82921SPaul Mullowney           v  += nz;
4039ae82921SPaul Mullowney           vi += nz;
4049ae82921SPaul Mullowney         }
4052205254eSKarl Rupp 
406aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
407da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
408da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
409aa372e3fSPaul Mullowney         /* Create the matrix description */
41057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
41157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4121b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
413afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
414afb2bd1cSJunchao Zhang        #else
41557d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
416afb2bd1cSJunchao Zhang        #endif
41757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
419aa372e3fSPaul Mullowney 
420aa372e3fSPaul Mullowney         /* set the operation */
421aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
422aa372e3fSPaul Mullowney 
423aa372e3fSPaul Mullowney         /* set the matrix */
424aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
425aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
427aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
428aa372e3fSPaul Mullowney 
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
430aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
431aa372e3fSPaul Mullowney 
432aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
433aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
434aa372e3fSPaul Mullowney 
435aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
436aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
437aa372e3fSPaul Mullowney 
438afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
439da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
440afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4411b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
442afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
443afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
444afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
445afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
446afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
447afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
448afb2bd1cSJunchao Zhang       #endif
449afb2bd1cSJunchao Zhang 
450aa372e3fSPaul Mullowney         /* perform the solve analysis */
451aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
452aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
453aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
454afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4551b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
456afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
457afb2bd1cSJunchao Zhang                                #endif
458afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
459da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
460da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
461aa372e3fSPaul Mullowney 
462da79fbbcSStefano Zampini         /* assign the pointer */
463aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4642cbc15d9SMark         loTriFactor->AA_h = AALo;
46557d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
46657d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4674863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
468da79fbbcSStefano Zampini       } else { /* update values only */
4692cbc15d9SMark         if (!loTriFactor->AA_h) {
4702cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4712cbc15d9SMark         }
472da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4732cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
474da79fbbcSStefano Zampini         v        = aa;
475da79fbbcSStefano Zampini         vi       = aj;
476da79fbbcSStefano Zampini         offset   = 1;
477da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
478da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4792cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
480da79fbbcSStefano Zampini           offset      += nz;
4812cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
482da79fbbcSStefano Zampini           offset      += 1;
483da79fbbcSStefano Zampini           v  += nz;
484da79fbbcSStefano Zampini         }
4852cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
486da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
487da79fbbcSStefano Zampini       }
4889ae82921SPaul Mullowney     } catch(char *ex) {
4899ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4909ae82921SPaul Mullowney     }
4919ae82921SPaul Mullowney   }
4929ae82921SPaul Mullowney   PetscFunctionReturn(0);
4939ae82921SPaul Mullowney }
4949ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - creates, or refreshes the values of, the upper
  triangular ILU factor held in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameter:
. A - the matrix; its Mat_SeqAIJ arrays a, j and diag are read on the host

  Notes:
  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  Row i of the factor is read from positions diag[i+1]+1 .. diag[i] of a/j, the entry at
  diag[i] being the diagonal one. The value written for the diagonal is the reciprocal of
  that stored entry (NOTE(review): presumably because the host factorization keeps the
  inverted diagonal - confirm against the factorization code).

  When an upper factor already exists only its values are recomputed into the AA_h host
  buffer and re-assigned to the existing CSR values array. Otherwise the full CSR structure
  is assembled in pinned host buffers, copied into newly allocated CSR arrays, the CUSPARSE
  solve analysis is run, and the new factor is stored in upTriFactorPtr; the values buffer is
  kept as AA_h while the two index buffers are freed.

  An exception of type char* raised inside the try block is reported as PETSC_ERR_LIB.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    nrows    = A->rmap->n;
  const PetscInt                    *colidx  = aij->j;
  const PetscInt                    *diag    = aij->diag;
  const MatScalar                   *vals    = aij->a;
  PetscInt                          row,first,noff,nnzU,pos;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* nothing to do unless the data is unallocated on the GPU or only valid on the CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of entries of the upper triangular factor, diagonal included */
    nnzU = diag[0] - diag[nrows];

    if (upper) {
      /* the factor already exists: recompute and upload the values only */
      if (!upper->AA_h) {
        cerr = cudaMallocHost((void**) &upper->AA_h, nnzU*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* rows are visited last to first, filling the value buffer from its end */
      pos = nnzU;
      for (row=nrows-1; row>=0; row--) {
        first = diag[row+1] + 1;               /* first off-diagonal entry of this row */
        noff  = diag[row] - diag[row+1] - 1;   /* number of entries NOT on the diagonal */
        pos  -= noff + 1;

        /* diagonal first (first+noff == diag[row]), then the off-diagonal values */
        upper->AA_h[pos] = 1./vals[diag[row]];
        ierr = PetscArraycpy(&upper->AA_h[pos+1], &vals[first], noff);CHKERRQ(ierr);
      }
      upper->csrMat->values->assign(upper->AA_h, upper->AA_h+nnzU);
      ierr = PetscLogCpuToGpu(nnzU*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      PetscScalar *AAUp;
      PetscInt    *AiUp,*AjUp;
      CsrMatrix   *csr;

      /* pinned host staging buffers for the values, row offsets and column indices */
      cerr = cudaMallocHost((void**) &AAUp, nnzU*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AiUp, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nnzU*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* assemble the CSR arrays, last row first */
      AiUp[0]     = (PetscInt) 0;
      AiUp[nrows] = nnzU;
      pos         = nnzU;
      for (row=nrows-1; row>=0; row--) {
        first = diag[row+1] + 1;               /* first off-diagonal entry of this row */
        noff  = diag[row] - diag[row+1] - 1;   /* number of entries NOT on the diagonal */
        pos  -= noff + 1;

        /* the diagonal entry leads the row (first+noff == diag[row]) */
        AjUp[pos] = (PetscInt) row;
        AAUp[pos] = (MatScalar)1./vals[diag[row]];
        AiUp[row] = AiUp[row+1] - (noff+1);

        /* followed by the off-diagonal column indices and values */
        ierr = PetscArraycpy(&AjUp[pos+1], &colidx[first], noff);CHKERRQ(ierr);
        ierr = PetscArraycpy(&AAUp[pos+1], &vals[first], noff);CHKERRQ(ierr);
      }

      /* allocate the triangular factor record */
      ierr = PetscNew(&upper);CHKERRQ(ierr);
      upper->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill mode, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upper->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upper->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upper->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upper->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the solve uses the factor as stored, not its transpose */
      upper->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR storage of the factor, filled from the staging buffers */
      upper->csrMat    = new CsrMatrix;
      csr              = upper->csrMat;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nnzU;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(AiUp, AiUp+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzU);
      csr->column_indices->assign(AjUp, AjUp+nnzU);

      csr->values = new THRUSTARRAY(nnzU);
      csr->values->assign(AAUp, AAUp+nnzU);

      /* create the solve analysis information */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upper->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* query the work buffer size needed by the analysis and allocate it */
      stat = cusparse_get_svbuffsize(factors->handle, upper->solveOp,
                                     csr->num_rows, csr->num_entries, upper->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upper->solveInfo,
                                     &upper->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upper->solveBuffer,upper->solveBufferSize);CHKERRCUDA(cerr);
     #endif

      /* perform the solve analysis; the policy and buffer arguments exist for CUDA >= 9 only */
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upper->solveInfo,
                               upper->solvePolicy, upper->solveBuffer);
     #else
      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upper->solveInfo);
     #endif
      CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the values buffer is kept for later value-only updates */
      factors->upTriFactorPtr = upper;
      upper->AA_h             = AAUp;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnzU)*sizeof(int)+nnzU*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6389ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds (or refreshes) the lower and upper ILU
  triangular factors of A and stores the row/column permutations next to them.

  Input Parameter:
. A - the matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Fails with PETSC_ERR_COR when A->spptr is NULL.

  After both factors are built, a work vector of length A->rmap->n is allocated if none
  exists, the nonzero count is recorded and A->offloadmask is set to PETSC_OFFLOAD_BOTH.

  A permutation is copied only when its index set is not the identity and no copy has been
  stored yet; each copy is logged as a CPU to GPU transfer of n PetscInt.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  const PetscInt               *idx;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build or update both triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* the work vector is created once and reused afterwards */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation (lower triangular indices) */
  ierr = ISIdentity(rowperm,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || factors->rpermIndices)) {
    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation (upper triangular indices) */
  ierr = ISIdentity(colperm,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || factors->cpermIndices)) {
    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6829ae82921SPaul Mullowney 
683087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
684087f3262SPaul Mullowney {
685087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
686087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
687aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
688aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
689087f3262SPaul Mullowney   cusparseStatus_t                  stat;
690087f3262SPaul Mullowney   PetscErrorCode                    ierr;
69157d48284SJunchao Zhang   cudaError_t                       cerr;
692087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
693087f3262SPaul Mullowney   PetscScalar                       *AAUp;
694087f3262SPaul Mullowney   PetscScalar                       *AALo;
695087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
696087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
697087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
698087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
699087f3262SPaul Mullowney 
700087f3262SPaul Mullowney   PetscFunctionBegin;
701cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
702c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
703087f3262SPaul Mullowney     try {
704da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
705da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
706da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
707087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
70857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
70957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
710087f3262SPaul Mullowney 
711087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
712087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
713087f3262SPaul Mullowney         AiUp[n]=nzUpper;
714087f3262SPaul Mullowney         offset = 0;
715087f3262SPaul Mullowney         for (i=0; i<n; i++) {
716087f3262SPaul Mullowney           /* set the pointers */
717087f3262SPaul Mullowney           v  = aa + ai[i];
718087f3262SPaul Mullowney           vj = aj + ai[i];
719087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
720087f3262SPaul Mullowney 
721087f3262SPaul Mullowney           /* first, set the diagonal elements */
722087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
72309f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
724087f3262SPaul Mullowney           AiUp[i]      = offset;
72509f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
726087f3262SPaul Mullowney 
727087f3262SPaul Mullowney           offset+=1;
728087f3262SPaul Mullowney           if (nz>0) {
729f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
730580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
731087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
732087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
733087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
734087f3262SPaul Mullowney             }
735087f3262SPaul Mullowney             offset+=nz;
736087f3262SPaul Mullowney           }
737087f3262SPaul Mullowney         }
738087f3262SPaul Mullowney 
739aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
740da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
741da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
742087f3262SPaul Mullowney 
743aa372e3fSPaul Mullowney         /* Create the matrix description */
74457d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
74557d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7461b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
747afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
748afb2bd1cSJunchao Zhang        #else
74957d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
750afb2bd1cSJunchao Zhang        #endif
75157d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
75257d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
753087f3262SPaul Mullowney 
754aa372e3fSPaul Mullowney         /* set the matrix */
755aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
756aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
757aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
758aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
759aa372e3fSPaul Mullowney 
760aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
761aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
762aa372e3fSPaul Mullowney 
763aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
764aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
765aa372e3fSPaul Mullowney 
766aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
767aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
768aa372e3fSPaul Mullowney 
769afb2bd1cSJunchao Zhang         /* set the operation */
770afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
771afb2bd1cSJunchao Zhang 
772afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
773da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
774afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7751b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
776afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
777afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
778afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
779afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
780afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
781afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
782afb2bd1cSJunchao Zhang       #endif
783afb2bd1cSJunchao Zhang 
784aa372e3fSPaul Mullowney         /* perform the solve analysis */
785aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
786aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
787aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
788afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7891b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
790afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
791afb2bd1cSJunchao Zhang                                 #endif
792afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
793da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
794da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
795aa372e3fSPaul Mullowney 
796da79fbbcSStefano Zampini         /* assign the pointer */
797aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
798aa372e3fSPaul Mullowney 
799aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
800da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
801da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
802aa372e3fSPaul Mullowney 
803aa372e3fSPaul Mullowney         /* Create the matrix description */
80457d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
80557d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8061b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
807afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
808afb2bd1cSJunchao Zhang        #else
80957d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
810afb2bd1cSJunchao Zhang        #endif
81157d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
81257d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
813aa372e3fSPaul Mullowney 
814aa372e3fSPaul Mullowney         /* set the operation */
815aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
816aa372e3fSPaul Mullowney 
817aa372e3fSPaul Mullowney         /* set the matrix */
818aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
819aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
820aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
821aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
824aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
825aa372e3fSPaul Mullowney 
826aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
827aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
828aa372e3fSPaul Mullowney 
829aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
830aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
831aa372e3fSPaul Mullowney 
832afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
833da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
834afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8351b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
836afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
837afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
838afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
839afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
840afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
841afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
842afb2bd1cSJunchao Zhang       #endif
843afb2bd1cSJunchao Zhang 
844aa372e3fSPaul Mullowney         /* perform the solve analysis */
845aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
846aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
847aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
848afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8491b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
850afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
851afb2bd1cSJunchao Zhang                                 #endif
852afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
853da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
854da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
855aa372e3fSPaul Mullowney 
856da79fbbcSStefano Zampini         /* assign the pointer */
857aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
858087f3262SPaul Mullowney 
859da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
86057d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
86157d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
862da79fbbcSStefano Zampini       } else {
863da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
864da79fbbcSStefano Zampini         offset = 0;
865da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
866da79fbbcSStefano Zampini           /* set the pointers */
867da79fbbcSStefano Zampini           v  = aa + ai[i];
868da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
869da79fbbcSStefano Zampini 
870da79fbbcSStefano Zampini           /* first, set the diagonal elements */
871da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
872da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
873da79fbbcSStefano Zampini 
874da79fbbcSStefano Zampini           offset+=1;
875da79fbbcSStefano Zampini           if (nz>0) {
876da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
877da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
878da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
879da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
880da79fbbcSStefano Zampini             }
881da79fbbcSStefano Zampini             offset+=nz;
882da79fbbcSStefano Zampini           }
883da79fbbcSStefano Zampini         }
884da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
885da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
886da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
887da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
888da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
889da79fbbcSStefano Zampini       }
89057d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
89157d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
892087f3262SPaul Mullowney     } catch(char *ex) {
893087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
894087f3262SPaul Mullowney     }
895087f3262SPaul Mullowney   }
896087f3262SPaul Mullowney   PetscFunctionReturn(0);
897087f3262SPaul Mullowney }
898087f3262SPaul Mullowney 
899087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
9009ae82921SPaul Mullowney {
9019ae82921SPaul Mullowney   PetscErrorCode               ierr;
902087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
903087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
904087f3262SPaul Mullowney   IS                           ip = a->row;
905087f3262SPaul Mullowney   PetscBool                    perm_identity;
906087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
907087f3262SPaul Mullowney 
908087f3262SPaul Mullowney   PetscFunctionBegin;
909da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
910087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
911da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
912aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
913aa372e3fSPaul Mullowney 
914da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
915da79fbbcSStefano Zampini 
916087f3262SPaul Mullowney   /* lower triangular indices */
917087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
918087f3262SPaul Mullowney   if (!perm_identity) {
9194e4bbfaaSStefano Zampini     IS             iip;
920da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9214e4bbfaaSStefano Zampini 
9224e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
9234e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
924da79fbbcSStefano Zampini     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
925aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
926aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
927aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9284e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9294e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
9304e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
931087f3262SPaul Mullowney     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
932da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
933da79fbbcSStefano Zampini   }
934087f3262SPaul Mullowney   PetscFunctionReturn(0);
935087f3262SPaul Mullowney }
936087f3262SPaul Mullowney 
9376fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
9389ae82921SPaul Mullowney {
9399ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
9409ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
9419ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
942b175d8bbSPaul Mullowney   PetscErrorCode ierr;
9439ae82921SPaul Mullowney 
9449ae82921SPaul Mullowney   PetscFunctionBegin;
94557181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
9469ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
947ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
948e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9499ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
9509ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
951bda325fcSPaul Mullowney   if (row_identity && col_identity) {
952bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
953bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9544e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9554e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
956bda325fcSPaul Mullowney   } else {
957bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
958bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9594e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9604e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
961bda325fcSPaul Mullowney   }
9628dc1d2a3SPaul Mullowney 
963e057df02SPaul Mullowney   /* get the triangular factors */
964087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
9659ae82921SPaul Mullowney   PetscFunctionReturn(0);
9669ae82921SPaul Mullowney }
9679ae82921SPaul Mullowney 
968087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
969087f3262SPaul Mullowney {
970087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
971087f3262SPaul Mullowney   IS             ip = b->row;
972087f3262SPaul Mullowney   PetscBool      perm_identity;
973b175d8bbSPaul Mullowney   PetscErrorCode ierr;
974087f3262SPaul Mullowney 
975087f3262SPaul Mullowney   PetscFunctionBegin;
97657181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
977087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
978ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
979087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
980087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
981087f3262SPaul Mullowney   if (perm_identity) {
982087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
983087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9844e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9854e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
986087f3262SPaul Mullowney   } else {
987087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
988087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9894e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9904e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
991087f3262SPaul Mullowney   }
992087f3262SPaul Mullowney 
993087f3262SPaul Mullowney   /* get the triangular factors */
994087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
995087f3262SPaul Mullowney   PetscFunctionReturn(0);
996087f3262SPaul Mullowney }
9979ae82921SPaul Mullowney 
998b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
999bda325fcSPaul Mullowney {
1000bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1001aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1002aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1003da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1004da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1005bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1006aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
1007aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
1008aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
1009aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
10101b0a6780SStefano Zampini   cudaError_t                       cerr;
1011da79fbbcSStefano Zampini   PetscErrorCode                    ierr;
1012b175d8bbSPaul Mullowney 
1013bda325fcSPaul Mullowney   PetscFunctionBegin;
1014aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
1015da79fbbcSStefano Zampini   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1016da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1017aa372e3fSPaul Mullowney 
1018aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1019aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1020aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1021aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1022aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1023aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1024aa372e3fSPaul Mullowney 
1025aa372e3fSPaul Mullowney   /* Create the matrix description */
102657d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
102757d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
102857d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
102957d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
103057d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1031aa372e3fSPaul Mullowney 
1032aa372e3fSPaul Mullowney   /* set the operation */
1033aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1034aa372e3fSPaul Mullowney 
1035aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1036aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1037afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1038afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1039aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1040afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1041afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1042afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1043aa372e3fSPaul Mullowney 
1044aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1045afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1046afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1047afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1048afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
1049afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
1050afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
1051afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
1052afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1053afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1054afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
10551b0a6780SStefano Zampini   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1056afb2bd1cSJunchao Zhang #endif
1057afb2bd1cSJunchao Zhang 
1058da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1059aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1060aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1061aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
1062aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
1063aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
1064aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
1065afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1066afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1067afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1068afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1069afb2bd1cSJunchao Zhang                         #else
1070afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1071afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1072afb2bd1cSJunchao Zhang                         #endif
1073afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1074da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1075da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1076aa372e3fSPaul Mullowney 
1077afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1078da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1079afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
10801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1081afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1082afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1083afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1084afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1085afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1086afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1087afb2bd1cSJunchao Zhang #endif
1088afb2bd1cSJunchao Zhang 
1089afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1090aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1091afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1092afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
10941b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1095afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1096afb2bd1cSJunchao Zhang                           #endif
1097afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1098da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1099da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1100aa372e3fSPaul Mullowney 
1101da79fbbcSStefano Zampini   /* assign the pointer */
1102aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1103aa372e3fSPaul Mullowney 
1104aa372e3fSPaul Mullowney   /*********************************************/
1105aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1106aa372e3fSPaul Mullowney   /*********************************************/
1107aa372e3fSPaul Mullowney 
1108aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1109da79fbbcSStefano Zampini   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1110da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1111aa372e3fSPaul Mullowney 
1112aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1113aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1114aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1115aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1116aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1117aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1118aa372e3fSPaul Mullowney 
1119aa372e3fSPaul Mullowney   /* Create the matrix description */
112057d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
112157d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
112257d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
112357d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
112457d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1125aa372e3fSPaul Mullowney 
1126aa372e3fSPaul Mullowney   /* set the operation */
1127aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1128aa372e3fSPaul Mullowney 
1129aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1130aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1131afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1132afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1133aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1134afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1135afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1136afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1137aa372e3fSPaul Mullowney 
1138aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1139afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1140afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1141afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1142afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1143afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1144afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1145afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1146afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1147afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1148afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1149afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1150afb2bd1cSJunchao Zhang #endif
1151afb2bd1cSJunchao Zhang 
1152da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1153aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1154aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1155aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1156aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1157aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1158aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1159afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1160afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1161afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1162afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1163afb2bd1cSJunchao Zhang                         #else
1164afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1165afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1166afb2bd1cSJunchao Zhang                         #endif
1167afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1168da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1169da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1170aa372e3fSPaul Mullowney 
1171afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1172da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1173afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
11741b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1175afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1176afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1177afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1178afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1179afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1180afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1181afb2bd1cSJunchao Zhang   #endif
1182afb2bd1cSJunchao Zhang 
1183afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1184aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1185afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1186afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1187afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
11881b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1189afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1190afb2bd1cSJunchao Zhang                           #endif
1191afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1192da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1193da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1194aa372e3fSPaul Mullowney 
1195da79fbbcSStefano Zampini   /* assign the pointer */
1196aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1197bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1198bda325fcSPaul Mullowney }
1199bda325fcSPaul Mullowney 
1200b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1201bda325fcSPaul Mullowney {
1202aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1203aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1204aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1205bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1206bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1207aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1208b06137fdSPaul Mullowney   cudaError_t                  err;
120985ba7357SStefano Zampini   PetscErrorCode               ierr;
1210b175d8bbSPaul Mullowney 
1211bda325fcSPaul Mullowney   PetscFunctionBegin;
1212fcdce8c4SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
121385ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
121485ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
121585ba7357SStefano Zampini   /* create cusparse matrix */
1216aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
121757d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1218aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
121957d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
122057d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1221aa372e3fSPaul Mullowney 
1222b06137fdSPaul Mullowney   /* set alpha and beta */
1223afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12247656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12257656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1226afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12277656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12287656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
122957d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1230b06137fdSPaul Mullowney 
1231aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1232aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1233aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1234554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1235554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1236aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1237a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1238aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1239aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1240a3fdcf43SKarl Rupp 
1241039c6fbaSStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
124281902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1243afb2bd1cSJunchao Zhang 
124481902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1245afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1246afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1247afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1248afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1249afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1250afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1251afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1252afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1253afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1254afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1255afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1256afb2bd1cSJunchao Zhang    #endif
1257afb2bd1cSJunchao Zhang 
1258a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1259a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1260aa372e3fSPaul Mullowney                             matrix->values->data().get(),
126181902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1262aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1263aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1264afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1265afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1266afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1267afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1268afb2bd1cSJunchao Zhang                           #else
1269afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1270afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1271afb2bd1cSJunchao Zhang                           #endif
1272afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1273aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1274afb2bd1cSJunchao Zhang 
1275afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1276afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1277afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1278afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1279afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1280afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1281afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1282afb2bd1cSJunchao Zhang    #endif
1283aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1284afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1285afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1286afb2bd1cSJunchao Zhang    #else
1287aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
128851c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
128951c6d536SStefano Zampini     /* First convert HYB to CSR */
1290aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1291aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1292aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1293aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1294aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1295aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1296aa372e3fSPaul Mullowney 
1297aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1298aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1299aa372e3fSPaul Mullowney                             temp->values->data().get(),
1300aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
130157d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1302aa372e3fSPaul Mullowney 
1303aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1304aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1305aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1306aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1307aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1308aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1309aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1310aa372e3fSPaul Mullowney 
1311aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1312aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1313aa372e3fSPaul Mullowney                             temp->values->data().get(),
1314aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1315aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1316aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1317aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1318aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
131957d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1320aa372e3fSPaul Mullowney 
1321aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1322aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
132357d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1324aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1325aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1326aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1327aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1328aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1329aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
133057d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1331aa372e3fSPaul Mullowney 
1332aa372e3fSPaul Mullowney     /* assign the pointer */
1333aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1334aa372e3fSPaul Mullowney     /* delete temporaries */
1335aa372e3fSPaul Mullowney     if (tempT) {
1336aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1337aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1338aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1339aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1340087f3262SPaul Mullowney     }
1341aa372e3fSPaul Mullowney     if (temp) {
1342aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1343aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1344aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1345aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1346aa372e3fSPaul Mullowney     }
1347afb2bd1cSJunchao Zhang    #endif
1348aa372e3fSPaul Mullowney   }
134905035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
135085ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
135185ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1352213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1353213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1354aa372e3fSPaul Mullowney   /* assign the pointer */
1355aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1356bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1357bda325fcSPaul Mullowney }
1358bda325fcSPaul Mullowney 
13594e4bbfaaSStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
13606fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1361bda325fcSPaul Mullowney {
1362c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1363465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1364465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1365465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1366465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1367bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1368bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1369aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1370aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1371aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1372b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
137357d48284SJunchao Zhang   cudaError_t                           cerr;
1374bda325fcSPaul Mullowney 
1375bda325fcSPaul Mullowney   PetscFunctionBegin;
1376aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1377aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1378bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1379aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1380aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1381bda325fcSPaul Mullowney   }
1382bda325fcSPaul Mullowney 
1383bda325fcSPaul Mullowney   /* Get the GPU pointers */
1384c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1385c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1386c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1387c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1388bda325fcSPaul Mullowney 
13897a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1390aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1391c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1392c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1393c41cb2e2SAlejandro Lamas Daviña                xGPU);
1394aa372e3fSPaul Mullowney 
1395aa372e3fSPaul Mullowney   /* First, solve U */
1396aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1397afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
13981b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1399afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1400afb2bd1cSJunchao Zhang                       #endif
1401afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1402aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1403aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1404aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1405aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1406afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
14071b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1408afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1409afb2bd1cSJunchao Zhang                       #endif
1410afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1411aa372e3fSPaul Mullowney 
1412aa372e3fSPaul Mullowney   /* Then, solve L */
1413aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1414afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14151b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1416afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1417afb2bd1cSJunchao Zhang                       #endif
1418afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1419aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1420aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1421aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1422aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1423afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
14241b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1425afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1426afb2bd1cSJunchao Zhang                       #endif
1427afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1428aa372e3fSPaul Mullowney 
1429aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1430c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1431c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1432aa372e3fSPaul Mullowney                tempGPU->begin());
1433aa372e3fSPaul Mullowney 
1434aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1435c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1436bda325fcSPaul Mullowney 
1437bda325fcSPaul Mullowney   /* restore */
1438c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1439c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
144005035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1441661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1442958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1443bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1444bda325fcSPaul Mullowney }
1445bda325fcSPaul Mullowney 
14466fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1447bda325fcSPaul Mullowney {
1448465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1449465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1450bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1451bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1452aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1453aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1454aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1455b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
145657d48284SJunchao Zhang   cudaError_t                       cerr;
1457bda325fcSPaul Mullowney 
1458bda325fcSPaul Mullowney   PetscFunctionBegin;
1459aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1460aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1461bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1462aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1463aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1464bda325fcSPaul Mullowney   }
1465bda325fcSPaul Mullowney 
1466bda325fcSPaul Mullowney   /* Get the GPU pointers */
1467c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1468c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1469bda325fcSPaul Mullowney 
14707a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1471aa372e3fSPaul Mullowney   /* First, solve U */
1472aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1473afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
14741b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1475afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1476afb2bd1cSJunchao Zhang                       #endif
1477afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1478aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1479aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1480aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1481aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1482afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
14831b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1484afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1485afb2bd1cSJunchao Zhang                       #endif
1486afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1487aa372e3fSPaul Mullowney 
1488aa372e3fSPaul Mullowney   /* Then, solve L */
1489aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1490afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14911b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1492afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1493afb2bd1cSJunchao Zhang                       #endif
1494afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1495aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1496aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1497aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1498aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1499afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
15001b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1501afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1502afb2bd1cSJunchao Zhang                       #endif
1503afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1504bda325fcSPaul Mullowney 
1505bda325fcSPaul Mullowney   /* restore */
1506c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1507c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
150805035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1509661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1510958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1511bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1512bda325fcSPaul Mullowney }
1513bda325fcSPaul Mullowney 
15146fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
15159ae82921SPaul Mullowney {
1516465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1517465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1518465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1519465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15209ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15219ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1522aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1523aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1524aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1525b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
152657d48284SJunchao Zhang   cudaError_t                           cerr;
15279ae82921SPaul Mullowney 
15289ae82921SPaul Mullowney   PetscFunctionBegin;
1529ebc8f436SDominic Meiser 
1530e057df02SPaul Mullowney   /* Get the GPU pointers */
1531c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1532c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1533c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1534c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
15359ae82921SPaul Mullowney 
15367a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1537aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1538c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1539c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
15404e4bbfaaSStefano Zampini                tempGPU->begin());
1541aa372e3fSPaul Mullowney 
1542aa372e3fSPaul Mullowney   /* Next, solve L */
1543aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1544afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
15451b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1546afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1547afb2bd1cSJunchao Zhang                       #endif
1548afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1549aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1550aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1551aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1552aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1553afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
15541b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1555afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1556afb2bd1cSJunchao Zhang                       #endif
1557afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1558aa372e3fSPaul Mullowney 
1559aa372e3fSPaul Mullowney   /* Then, solve U */
1560aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1561afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
15621b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1563afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1564afb2bd1cSJunchao Zhang                       #endif
1565afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1566aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1567aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1568aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1569aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1570afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
15711b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1572afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1573afb2bd1cSJunchao Zhang                       #endif
1574afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1575aa372e3fSPaul Mullowney 
15764e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
15774e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
15784e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
15794e4bbfaaSStefano Zampini                xGPU);
15809ae82921SPaul Mullowney 
1581c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1582c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
158305035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1584661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1585958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
15869ae82921SPaul Mullowney   PetscFunctionReturn(0);
15879ae82921SPaul Mullowney }
15889ae82921SPaul Mullowney 
15896fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
15909ae82921SPaul Mullowney {
1591465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1592465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
15939ae82921SPaul Mullowney   cusparseStatus_t                  stat;
15949ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1595aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1596aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1597aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1598b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
159957d48284SJunchao Zhang   cudaError_t                       cerr;
16009ae82921SPaul Mullowney 
16019ae82921SPaul Mullowney   PetscFunctionBegin;
1602e057df02SPaul Mullowney   /* Get the GPU pointers */
1603c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1604c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
16059ae82921SPaul Mullowney 
16067a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1607aa372e3fSPaul Mullowney   /* First, solve L */
1608aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1609afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16101b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1612afb2bd1cSJunchao Zhang                       #endif
1613afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1614aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1615aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1616aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1617aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1618afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
16191b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1620afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1621afb2bd1cSJunchao Zhang                       #endif
1622afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1623aa372e3fSPaul Mullowney 
1624aa372e3fSPaul Mullowney   /* Next, solve U */
1625aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1626afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16271b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1629afb2bd1cSJunchao Zhang                       #endif
1630afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1631aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1632aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1633aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1634aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1635afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
16361b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1637afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1638afb2bd1cSJunchao Zhang                       #endif
1639afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
16409ae82921SPaul Mullowney 
1641c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1642c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
164305035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1644661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1645958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
16469ae82921SPaul Mullowney   PetscFunctionReturn(0);
16479ae82921SPaul Mullowney }
16489ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the nonzero values of A from the GPU CSR storage
  into the host array a->a.

  Only acts when A->offloadmask is PETSC_OFFLOAD_GPU; in every other state it returns
  without touching anything. Only the values are transferred (a->nz scalars); the row
  offsets and column indices are not copied. On success the offload mask becomes
  PETSC_OFFLOAD_BOTH.

  Errors:
    PETSC_ERR_PLIB - the GPU structure or its value array is missing, or the host value
                     array is missing while a->nz is nonzero
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* fail with a PETSc error instead of dereferencing a NULL pointer */
    if (!cusp || !cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values on the GPU");
    /* same validation MatSeqAIJCUSPARSECopyToGPU() applies before touching a->a */
    if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device now hold the same values */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
16697e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host array of nonzero values of A.

  The host values are first refreshed from the GPU (a no-op unless the GPU copy is the
  only valid one). The offload mask is then set to PETSC_OFFLOAD_CPU, so the device copy
  is treated as stale from this point on.

  Input Parameter:
.  A - the matrix

  Output Parameter:
.  array - set to the host value array (Mat_SeqAIJ::a)
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host values are current before exposing them */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  /* only the host copy is considered valid once the array has been handed out */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
16817e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the GPU representation of A consistent with the host data.

  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  Two paths exist:

   - values only: a GPU structure already exists, the nonzero state of A matches the one
     recorded in the GPU structure and the storage format is MAT_CUSPARSE_CSR. Only a->a is
     uploaded; if a transpose was generated before, its values are refreshed with csr2csc.

   - full rebuild: the existing GPU structures (matrix, transpose, work vector, full row
     offsets) are destroyed and a new Mat_SeqAIJCUSPARSEMultStruct is built from the host
     CSR data (or from the compressed-row data when a->compressedrow.use is set).

  On success the offload mask becomes PETSC_OFFLOAD_BOTH, except when the rebuild ran
  without host values (a->a == NULL); in that case the mask is left untouched.

  Errors:
    PETSC_ERR_PLIB - A is bound to the CPU, or required host/device CSR data is missing
    PETSC_ERR_SUP  - ELL/HYB storage requested with CUDA >= 11
    PETSC_ERR_LIB  - a char* exception was caught during the rebuild
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* the values-only path needs an existing GPU structure; without one fall through to the rebuild */
    if (cusparsestruct->mat && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        /* csr2csc below reads the row offsets kept in rowoffsets_gpu; refuse to dereference a NULL pointer */
        if (!cusparsestruct->rowoffsets_gpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row offsets on the GPU");
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      /* reset the freed pointers: rowoffsets_gpu is not reassigned in this function and
         workVector would otherwise dangle if one of the checks below returns early */
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* without host values only the sparsity pattern is uploaded, so the host and
           device copies cannot be declared in sync afterwards */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode right after */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary CSR copy on the device, converted to HYB/ELL and then released */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* logged volume: row offsets and column indices (int), compressed row indices (PetscInt), three scalar constants and the values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero pattern the GPU structure was built for */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18609ae82921SPaul Mullowney 
/* Functor for a two-element tuple: accumulates element 0 into element 1, i.e. get<1> += get<0> */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) += thrust::get<0>(entry);
  }
};
1870aa372e3fSPaul Mullowney 
/* Functor for a two-element tuple: copies element 0 into element 1, i.e. get<1> = get<0> */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
18807e8381f9SStefano Zampini 
/* Functor for a two-element tuple: copies element 1 into element 0, i.e. get<0> = get<1> */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1890e6e9a74fSStefano Zampini 
/* Per-product data attached to a matrix product involving a SeqAIJCUSPARSE matrix;
   released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;     /* NOTE(review): presumably records whether C was of dense (non-CUDA) type - confirm at the use sites */
  PetscScalar           *Bt;          /* device buffer, released with cudaFree() */
  Mat                   X;            /* auxiliary matrix, released with MatDestroy() */
  PetscBool             reusesym;     /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;        /* NOTE(review): presumably the flop count logged for the product - confirm */
  CsrMatrix             *Bcsr;        /* CSR descriptor object, released with delete */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;  /* sparse matrix descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor, destroyed with cusparseDestroyDnMat() */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor, destroyed with cusparseDestroyDnMat() */
  PetscInt              Blda;         /* Record leading dimension of B here to detect changes */
  PetscInt              Clda;         /* Record leading dimension of C here to detect changes */
  size_t                mmBufferSize; /* size in bytes recorded for mmBuffer */
  void                  *mmBuffer;    /* device work buffer, released with cudaFree() */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1910ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything owned by a MatMatCusparse object and
  then the object itself.

  Input Parameter:
.  data - pointer to the MatMatCusparse structure (passed as void*)

  The optional cuSPARSE descriptors and work buffers exist only with CUDA >= 11 and are
  released only when they are non-NULL.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  /* device buffer and CSR descriptor object */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  /* auxiliary matrix, then the container itself */
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1935ccdfe979SStefano Zampini 
1936ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1937ccdfe979SStefano Zampini 
1938ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1939ccdfe979SStefano Zampini {
1940ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1941ccdfe979SStefano Zampini   Mat                          A,B;
1942afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1943ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1944ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1945ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1946ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1947ccdfe979SStefano Zampini   const PetscScalar            *barray;
1948ccdfe979SStefano Zampini   PetscScalar                  *carray;
1949ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1950ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1951ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1952ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1953afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1954ccdfe979SStefano Zampini 
1955ccdfe979SStefano Zampini   PetscFunctionBegin;
1956ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1957ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1958ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1959ccdfe979SStefano Zampini   A    = product->A;
1960ccdfe979SStefano Zampini   B    = product->B;
1961ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1962ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1963ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1964ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1965ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1966ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1967ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1968ccdfe979SStefano Zampini   switch (product->type) {
1969ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1970ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1971ccdfe979SStefano Zampini     mat = cusp->mat;
1972ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1973ccdfe979SStefano Zampini     m   = A->rmap->n;
1974ccdfe979SStefano Zampini     n   = B->cmap->n;
1975ccdfe979SStefano Zampini     break;
1976ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1977e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1978e6e9a74fSStefano Zampini       mat = cusp->mat;
1979e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1980e6e9a74fSStefano Zampini     } else {
1981ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1982ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1983ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1984e6e9a74fSStefano Zampini     }
1985ccdfe979SStefano Zampini     m = A->cmap->n;
1986ccdfe979SStefano Zampini     n = B->cmap->n;
1987ccdfe979SStefano Zampini     break;
1988ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1989ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1990ccdfe979SStefano Zampini     mat = cusp->mat;
1991ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1992ccdfe979SStefano Zampini     m   = A->rmap->n;
1993ccdfe979SStefano Zampini     n   = B->rmap->n;
1994ccdfe979SStefano Zampini     break;
1995ccdfe979SStefano Zampini   default:
1996ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1997ccdfe979SStefano Zampini   }
1998ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1999ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2000ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2001ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2002afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2003ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2004afb2bd1cSJunchao Zhang 
2005ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2006c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2007c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2008c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2009c8378d12SStefano Zampini   } else {
2010c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2011c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2012c8378d12SStefano Zampini   }
2013c8378d12SStefano Zampini 
2014c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2015afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2016afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
2018afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2019fcdce8c4SStefano Zampini     size_t mmBufferSize;
2020afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2021afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2022afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2023afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2024afb2bd1cSJunchao Zhang     }
2025c8378d12SStefano Zampini 
2026afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2027afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2028afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2029afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2030afb2bd1cSJunchao Zhang     }
2031afb2bd1cSJunchao Zhang 
2032afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2033afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2034afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2035afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2036afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2037afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2038afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2039afb2bd1cSJunchao Zhang     }
2040afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2041afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2042afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2043fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2044fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2045fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2046fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2047fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2048fcdce8c4SStefano Zampini     }
2049afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2050afb2bd1cSJunchao Zhang   } else {
2051afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2052afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2053afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2054afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2055afb2bd1cSJunchao Zhang   }
2056afb2bd1cSJunchao Zhang 
2057afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2058afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2059afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2060afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2061fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2062afb2bd1cSJunchao Zhang  #else
2063afb2bd1cSJunchao Zhang   PetscInt k;
2064afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2065ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2066ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2067ccdfe979SStefano Zampini     cublasStatus_t cerr;
2068ccdfe979SStefano Zampini 
2069ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2070ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2071ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2072ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2073ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2074ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2075ccdfe979SStefano Zampini     blda = B->cmap->n;
2076afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2077afb2bd1cSJunchao Zhang   } else {
2078afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2079ccdfe979SStefano Zampini   }
2080ccdfe979SStefano Zampini 
2081afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2082ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2083afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2084ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2085ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2086ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2087ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2088ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2089afb2bd1cSJunchao Zhang  #endif
2090afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2091c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2092c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2093ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2094ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2095ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2096ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2097ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2098ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2099ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2100ccdfe979SStefano Zampini   } else {
2101ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2102ccdfe979SStefano Zampini   }
2103ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2104ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2105ccdfe979SStefano Zampini   }
2106ccdfe979SStefano Zampini   if (!biscuda) {
2107ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2108ccdfe979SStefano Zampini   }
2109ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2110ccdfe979SStefano Zampini }
2111ccdfe979SStefano Zampini 
2112ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2113ccdfe979SStefano Zampini {
2114ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2115ccdfe979SStefano Zampini   Mat                A,B;
2116ccdfe979SStefano Zampini   PetscInt           m,n;
2117ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2118ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2119ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2120ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2121ccdfe979SStefano Zampini 
2122ccdfe979SStefano Zampini   PetscFunctionBegin;
2123ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2124ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2125ccdfe979SStefano Zampini   A    = product->A;
2126ccdfe979SStefano Zampini   B    = product->B;
2127ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2128ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2129ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2130ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2131ccdfe979SStefano Zampini   switch (product->type) {
2132ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2133ccdfe979SStefano Zampini     m = A->rmap->n;
2134ccdfe979SStefano Zampini     n = B->cmap->n;
2135ccdfe979SStefano Zampini     break;
2136ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2137ccdfe979SStefano Zampini     m = A->cmap->n;
2138ccdfe979SStefano Zampini     n = B->cmap->n;
2139ccdfe979SStefano Zampini     break;
2140ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2141ccdfe979SStefano Zampini     m = A->rmap->n;
2142ccdfe979SStefano Zampini     n = B->rmap->n;
2143ccdfe979SStefano Zampini     break;
2144ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2145ccdfe979SStefano Zampini     m = B->cmap->n;
2146ccdfe979SStefano Zampini     n = B->cmap->n;
2147ccdfe979SStefano Zampini     break;
2148ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2149ccdfe979SStefano Zampini     m = B->rmap->n;
2150ccdfe979SStefano Zampini     n = B->rmap->n;
2151ccdfe979SStefano Zampini     break;
2152ccdfe979SStefano Zampini   default:
2153ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2154ccdfe979SStefano Zampini   }
2155ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2156ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2157ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2158ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2159ccdfe979SStefano Zampini 
2160ccdfe979SStefano Zampini   /* product data */
2161ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2162ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2163afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2164afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2165ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2166afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2167ccdfe979SStefano Zampini   }
2168afb2bd1cSJunchao Zhang  #endif
2169ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2170ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2171ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2172ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2173ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2174ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2175ccdfe979SStefano Zampini     } else {
2176ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2177ccdfe979SStefano Zampini     }
2178ccdfe979SStefano Zampini   }
2179ccdfe979SStefano Zampini   C->product->data    = mmdata;
2180ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2181ccdfe979SStefano Zampini 
2182ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2183ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2184ccdfe979SStefano Zampini }
2185ccdfe979SStefano Zampini 
2186fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2187ccdfe979SStefano Zampini {
2188ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2189fcdce8c4SStefano Zampini   Mat                          A,B;
2190fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2191fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2192fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2193fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2194fcdce8c4SStefano Zampini   PetscBool                    flg;
2195ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2196fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2197fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2198fcdce8c4SStefano Zampini   MatProductType               ptype;
2199fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2200fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2201fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2202fcdce8c4SStefano Zampini #endif
2203ccdfe979SStefano Zampini 
2204ccdfe979SStefano Zampini   PetscFunctionBegin;
2205ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2206fcdce8c4SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2207fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2208fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2209fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2210fcdce8c4SStefano Zampini   A = product->A;
2211fcdce8c4SStefano Zampini   B = product->B;
2212fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2213fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2214fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2215fcdce8c4SStefano Zampini     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2216fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
2217fcdce8c4SStefano Zampini     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2218fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2219fcdce8c4SStefano Zampini     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2220fcdce8c4SStefano Zampini     goto finalize;
2221fcdce8c4SStefano Zampini   }
2222fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2223fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2224fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2225fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2226fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2227fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2228fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2229fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2230fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2231fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2232fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2233fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2234fcdce8c4SStefano Zampini   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2235fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2236fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2237fcdce8c4SStefano Zampini 
2238fcdce8c4SStefano Zampini   ptype = product->type;
2239fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2240fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2241fcdce8c4SStefano Zampini   switch (ptype) {
2242fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2243fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2244fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2245fcdce8c4SStefano Zampini     break;
2246fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2247fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2248fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2249fcdce8c4SStefano Zampini     break;
2250fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2251fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2252fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2253fcdce8c4SStefano Zampini     break;
2254fcdce8c4SStefano Zampini   default:
2255fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2256fcdce8c4SStefano Zampini   }
2257fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
2258fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2259fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2260fcdce8c4SStefano Zampini   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2261fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2262fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2263fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2264fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2265fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2266fcdce8c4SStefano Zampini   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2267fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2268fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2269fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2270fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2271fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2272fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2273fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2274fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2275fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2276fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2277fcdce8c4SStefano Zampini #else
2278fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2279fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2280fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2281fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2282fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2283fcdce8c4SStefano Zampini #endif
2284fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2285fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2286fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2287fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2288fcdce8c4SStefano Zampini finalize:
2289fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2290fcdce8c4SStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2291fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2292fcdce8c4SStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2293fcdce8c4SStefano Zampini   c->reallocs         = 0;
2294fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2295fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2296fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2297fcdce8c4SStefano Zampini   C->num_ass++;
2298ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2299ccdfe979SStefano Zampini }
2300fcdce8c4SStefano Zampini 
2301fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2302fcdce8c4SStefano Zampini {
2303fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2304fcdce8c4SStefano Zampini   Mat                          A,B;
2305fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2306fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2307fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2308fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2309fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2310fcdce8c4SStefano Zampini   PetscBool                    flg;
2311fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2312fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2313fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2314fcdce8c4SStefano Zampini   MatProductType               ptype;
2315fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2316fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2317fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2318fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2319fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2320fcdce8c4SStefano Zampini   size_t                       bufSize2;
2321fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2322fcdce8c4SStefano Zampini #else
2323fcdce8c4SStefano Zampini   int                          cnz;
2324fcdce8c4SStefano Zampini #endif
2325fcdce8c4SStefano Zampini 
2326fcdce8c4SStefano Zampini   PetscFunctionBegin;
2327fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2328fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2329fcdce8c4SStefano Zampini   A    = product->A;
2330fcdce8c4SStefano Zampini   B    = product->B;
2331fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2332fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2333fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2334fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2335fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2336fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2337fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2338fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2339fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2340fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2341fcdce8c4SStefano Zampini 
2342fcdce8c4SStefano Zampini   /* product data */
2343fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2344fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2345fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2346fcdce8c4SStefano Zampini 
2347fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2348fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2349fcdce8c4SStefano Zampini   ptype = product->type;
2350fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2351fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2352fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2353fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2354fcdce8c4SStefano Zampini   switch (ptype) {
2355fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2356fcdce8c4SStefano Zampini     m = A->rmap->n;
2357fcdce8c4SStefano Zampini     n = B->cmap->n;
2358fcdce8c4SStefano Zampini     k = A->cmap->n;
2359fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2360fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2361fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2362fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2363fcdce8c4SStefano Zampini     break;
2364fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2365fcdce8c4SStefano Zampini     m = A->cmap->n;
2366fcdce8c4SStefano Zampini     n = B->cmap->n;
2367fcdce8c4SStefano Zampini     k = A->rmap->n;
2368fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2369fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2370fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2371fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2372fcdce8c4SStefano Zampini     break;
2373fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2374fcdce8c4SStefano Zampini     m = A->rmap->n;
2375fcdce8c4SStefano Zampini     n = B->rmap->n;
2376fcdce8c4SStefano Zampini     k = A->cmap->n;
2377fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2378fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2379fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2380fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2381fcdce8c4SStefano Zampini     break;
2382fcdce8c4SStefano Zampini   default:
2383fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2384fcdce8c4SStefano Zampini   }
2385fcdce8c4SStefano Zampini 
2386fcdce8c4SStefano Zampini   /* create cusparse matrix */
2387fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2388fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2389fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2390fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2391fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2392fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2393fcdce8c4SStefano Zampini 
2394fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2395fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2396fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2397fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2398fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2399fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2400fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2401fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2402fcdce8c4SStefano Zampini   } else {
2403fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2404fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2405fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2406fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2407fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2408fcdce8c4SStefano Zampini   }
2409fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2410fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2411fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2412fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2413fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2414fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2415fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2416fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2417fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2418fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2419fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2420fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2421fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2422fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2423fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2424fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2425fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2426fcdce8c4SStefano Zampini     c->nz = 0;
2427fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2428fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2429fcdce8c4SStefano Zampini     goto finalizesym;
2430fcdce8c4SStefano Zampini   }
2431fcdce8c4SStefano Zampini 
2432fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2433fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2434fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2435fcdce8c4SStefano Zampini   if (!biscompressed) {
2436fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2437fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2438fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2439fcdce8c4SStefano Zampini #endif
2440fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2441fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2442fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2443fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2444fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2445fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2446fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2447fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2448fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2449fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2450fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2451fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2452fcdce8c4SStefano Zampini     }
2453fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2454fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2455fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2456fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2457fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2458fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2459fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2460fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2461fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2462fcdce8c4SStefano Zampini     }
2463fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2464fcdce8c4SStefano Zampini #endif
2465fcdce8c4SStefano Zampini   }
2466fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2467fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2468fcdce8c4SStefano Zampini   /* precompute flops count */
2469fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2470fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2471fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2472fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2473fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2474fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2475fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2476fcdce8c4SStefano Zampini       }
2477fcdce8c4SStefano Zampini     }
2478fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2479fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2480fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2481fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2482fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2483fcdce8c4SStefano Zampini     }
2484fcdce8c4SStefano Zampini   } else { /* TODO */
2485fcdce8c4SStefano Zampini     flops = 0.;
2486fcdce8c4SStefano Zampini   }
2487fcdce8c4SStefano Zampini 
2488fcdce8c4SStefano Zampini   mmdata->flops = flops;
2489fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2490fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2491fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2492fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2493fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2494fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2497fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2498fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2499fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2500fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2501fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2502bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2503fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2504fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2505fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2506fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2507fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2508fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2509fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2510fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2511fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2512fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2513fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2514fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2515fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2516fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2517fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2518bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2519fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2520fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2521fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2522fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2523fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2524fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2525fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2526fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
252700702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2528fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2529fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2530fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2531fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2532fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2533fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2534fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2535fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2536fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2537fcdce8c4SStefano Zampini #else
2538fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2539fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2540fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2541fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2542fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2543fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2544fcdce8c4SStefano Zampini   c->nz = cnz;
2545fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2546fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2547fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2548fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2549fcdce8c4SStefano Zampini 
2550fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2551fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2552fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2553fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2554fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2556fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2557fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2558fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2559fcdce8c4SStefano Zampini #endif
2560fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2561fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2562fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2563fcdce8c4SStefano Zampini finalizesym:
2564fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2565fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2566fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2567fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2568fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2569fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2570fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2571fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2572fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2573fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2574fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2575fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2576fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2577fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2578fcdce8c4SStefano Zampini   } else {
2579fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2580fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2581fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2582fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2583fcdce8c4SStefano Zampini   }
2584fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2585fcdce8c4SStefano Zampini     PetscInt r = 0;
2586fcdce8c4SStefano Zampini     c->i[0] = 0;
2587fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2588fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2589fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2590fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2591fcdce8c4SStefano Zampini     }
2592fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2593fcdce8c4SStefano Zampini   }
2594fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2595fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2596fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2597fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2598fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2599fcdce8c4SStefano Zampini   c->rmax = 0;
2600fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2601fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2602fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2603fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2604fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2605fcdce8c4SStefano Zampini   }
2606fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2607fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2608fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2609fcdce8c4SStefano Zampini 
2610fcdce8c4SStefano Zampini   C->nonzerostate++;
2611fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2612fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2613fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2614fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2615fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2616fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2617fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2618abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2619fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2620fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2621fcdce8c4SStefano Zampini   }
2622fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2623fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2624fcdce8c4SStefano Zampini }
2625fcdce8c4SStefano Zampini 
2626fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2627fcdce8c4SStefano Zampini 
2628fcdce8c4SStefano Zampini /* handles sparse or dense B */
2629fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2630fcdce8c4SStefano Zampini {
2631fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2632fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2633fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2634fcdce8c4SStefano Zampini 
2635fcdce8c4SStefano Zampini   PetscFunctionBegin;
2636fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
2637fcdce8c4SStefano Zampini   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2638abb89eb1SStefano Zampini   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2639fcdce8c4SStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2640fcdce8c4SStefano Zampini   }
2641fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2642fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2643fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
2644fcdce8c4SStefano Zampini       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2645fcdce8c4SStefano Zampini     }
2646fcdce8c4SStefano Zampini   }
2647fcdce8c4SStefano Zampini   if (isdense) {
2648ccdfe979SStefano Zampini     switch (product->type) {
2649ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2650ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2651ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2652ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2653ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2654fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
2655fcdce8c4SStefano Zampini         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2656fcdce8c4SStefano Zampini       } else {
2657fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2658fcdce8c4SStefano Zampini       }
2659fcdce8c4SStefano Zampini       break;
2660fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2661fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2662fcdce8c4SStefano Zampini       break;
2663ccdfe979SStefano Zampini     default:
2664ccdfe979SStefano Zampini       break;
2665ccdfe979SStefano Zampini     }
2666fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2667fcdce8c4SStefano Zampini     switch (product->type) {
2668fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2669fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2670fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2671fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2672fcdce8c4SStefano Zampini       break;
2673fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2674fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2675fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2676fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2677fcdce8c4SStefano Zampini       break;
2678fcdce8c4SStefano Zampini     default:
2679fcdce8c4SStefano Zampini       break;
2680fcdce8c4SStefano Zampini     }
2681fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
2682fcdce8c4SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2683fcdce8c4SStefano Zampini   }
2684ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2685ccdfe979SStefano Zampini }
2686ccdfe979SStefano Zampini 
26876fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
26889ae82921SPaul Mullowney {
2689b175d8bbSPaul Mullowney   PetscErrorCode ierr;
26909ae82921SPaul Mullowney 
26919ae82921SPaul Mullowney   PetscFunctionBegin;
2692e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2693e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2694e6e9a74fSStefano Zampini }
2695e6e9a74fSStefano Zampini 
2696e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2697e6e9a74fSStefano Zampini {
2698e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2699e6e9a74fSStefano Zampini 
2700e6e9a74fSStefano Zampini   PetscFunctionBegin;
2701e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2702e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2703e6e9a74fSStefano Zampini }
2704e6e9a74fSStefano Zampini 
2705e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2706e6e9a74fSStefano Zampini {
2707e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2708e6e9a74fSStefano Zampini 
2709e6e9a74fSStefano Zampini   PetscFunctionBegin;
2710e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2711e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2712e6e9a74fSStefano Zampini }
2713e6e9a74fSStefano Zampini 
2714e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2715e6e9a74fSStefano Zampini {
2716e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2717e6e9a74fSStefano Zampini 
2718e6e9a74fSStefano Zampini   PetscFunctionBegin;
2719e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
27209ae82921SPaul Mullowney   PetscFunctionReturn(0);
27219ae82921SPaul Mullowney }
27229ae82921SPaul Mullowney 
27236fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2724ca45077fSPaul Mullowney {
2725b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2726ca45077fSPaul Mullowney 
2727ca45077fSPaul Mullowney   PetscFunctionBegin;
2728e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2729ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2730ca45077fSPaul Mullowney }
2731ca45077fSPaul Mullowney 
2732afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2733e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27349ae82921SPaul Mullowney {
27359ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2736aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27379ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2738e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2739b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
274057d48284SJunchao Zhang   cudaError_t                  cerr;
2741aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2742e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2743e6e9a74fSStefano Zampini   PetscBool                    compressed;
2744afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2745afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2746afb2bd1cSJunchao Zhang #endif
27476e111a19SKarl Rupp 
27489ae82921SPaul Mullowney   PetscFunctionBegin;
2749e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2750e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2751afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2752d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2753e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2754e6e9a74fSStefano Zampini   }
275534d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
275634d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2757e6e9a74fSStefano Zampini   if (!trans) {
27589ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2759c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2760e6e9a74fSStefano Zampini   } else {
2761e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2762e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2763e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2764e6e9a74fSStefano Zampini     } else {
2765afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2766e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2767e6e9a74fSStefano Zampini     }
2768e6e9a74fSStefano Zampini   }
2769e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2770e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2771213423ffSJunchao Zhang 
2772e6e9a74fSStefano Zampini   try {
2773e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2774213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2775213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2776afb2bd1cSJunchao Zhang 
277785ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2778e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2779afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2780afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2781afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2782afb2bd1cSJunchao Zhang       */
2783e6e9a74fSStefano Zampini       xptr = xarray;
2784afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2785213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2786afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2787afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2788afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2789afb2bd1cSJunchao Zhang        */
2790afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2791afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2792afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2793afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2794afb2bd1cSJunchao Zhang       }
2795afb2bd1cSJunchao Zhang      #endif
2796e6e9a74fSStefano Zampini     } else {
2797afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2798afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2799afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2800afb2bd1cSJunchao Zhang        */
2801afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2802e6e9a74fSStefano Zampini       dptr = zarray;
2803e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2804afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2805e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2806e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2807e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2808e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2809e6e9a74fSStefano Zampini       }
2810afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2811afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2812afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2813afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2814afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2815afb2bd1cSJunchao Zhang       }
2816afb2bd1cSJunchao Zhang      #endif
2817e6e9a74fSStefano Zampini     }
28189ae82921SPaul Mullowney 
2819afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2820aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2821afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2822afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2823afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2824afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2825afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2826afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2827afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2828afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2829afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2830afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2831afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2832afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2833afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2834afb2bd1cSJunchao Zhang 
2835afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2836afb2bd1cSJunchao Zhang       } else {
2837afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2838afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2839afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2840afb2bd1cSJunchao Zhang       }
2841afb2bd1cSJunchao Zhang 
2842afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2843afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2844afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2845afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2846afb2bd1cSJunchao Zhang                                beta,
2847afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2848afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2849afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2850afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2851afb2bd1cSJunchao Zhang      #else
28527656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2853e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2854a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2855afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2856aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2857e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
285857d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2859afb2bd1cSJunchao Zhang      #endif
2860aa372e3fSPaul Mullowney     } else {
2861213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2862afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2863afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2864afb2bd1cSJunchao Zhang        #else
2865301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2866e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2867afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2868e6e9a74fSStefano Zampini                                  xptr, beta,
286957d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2870afb2bd1cSJunchao Zhang        #endif
2871a65300a6SPaul Mullowney       }
2872aa372e3fSPaul Mullowney     }
287305035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2874958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2875aa372e3fSPaul Mullowney 
2876e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2877213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2878213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2879213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2880e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2881213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
28827656d835SStefano Zampini         }
2883213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2884c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
28857656d835SStefano Zampini       }
28867656d835SStefano Zampini 
2887213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2888213423ffSJunchao Zhang       if (compressed) {
2889213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2890e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2891c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2892e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2893c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
289405035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2895958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2896e6e9a74fSStefano Zampini       }
2897e6e9a74fSStefano Zampini     } else {
2898e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2899e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2900e6e9a74fSStefano Zampini       }
2901e6e9a74fSStefano Zampini     }
2902e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2903213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2904213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29059ae82921SPaul Mullowney   } catch(char *ex) {
29069ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29079ae82921SPaul Mullowney   }
2908e6e9a74fSStefano Zampini   if (yy) {
2909958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2910e6e9a74fSStefano Zampini   } else {
2911e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2912e6e9a74fSStefano Zampini   }
29139ae82921SPaul Mullowney   PetscFunctionReturn(0);
29149ae82921SPaul Mullowney }
29159ae82921SPaul Mullowney 
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - Thin wrapper that forwards (A,xx,yy,zz) to
   MatMultAddKernel_SeqAIJCUSPARSE() with the two trailing flags set to
   (PETSC_TRUE,PETSC_FALSE).

   NOTE(review): the kernel's signature is outside this view; the flags are presumably
   "transpose" and "hermitian" -- confirm against its definition.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool firstflag  = PETSC_TRUE;
  const PetscBool secondflag = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,firstflag,secondflag);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2924ca45077fSPaul Mullowney 
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Finishes assembly through MatAssemblyEnd_SeqAIJ() and,
   for a final assembly of an unfactored matrix that is not bound to the CPU and that
   carries a device matrix (deviceMat), flags the data as living on the GPU.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscBool      hasdevicemat = PETSC_FALSE;

  PetscFunctionBegin;
  /* sample deviceMat before the host assembly runs, as the pointer is read up front */
  if (A->factortype == MAT_FACTOR_NONE && ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat) hasdevicemat = PETSC_TRUE;

  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* this does very little if assembled on GPU - call it? */

  /* flush assemblies and CPU-bound matrices leave the offload mask untouched */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && hasdevicemat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
29419ae82921SPaul Mullowney 
29429ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2943e057df02SPaul Mullowney /*@
29449ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2945e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately be pushed down
2946e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2947e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2948e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2949e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
29509ae82921SPaul Mullowney 
2951d083f849SBarry Smith    Collective
29529ae82921SPaul Mullowney 
29539ae82921SPaul Mullowney    Input Parameters:
29549ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
29559ae82921SPaul Mullowney .  m - number of rows
29569ae82921SPaul Mullowney .  n - number of columns
29579ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
29589ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
29590298fd71SBarry Smith          (possibly different for each row) or NULL
29609ae82921SPaul Mullowney 
29619ae82921SPaul Mullowney    Output Parameter:
29629ae82921SPaul Mullowney .  A - the matrix
29639ae82921SPaul Mullowney 
29649ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
29659ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
29669ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
29679ae82921SPaul Mullowney 
29689ae82921SPaul Mullowney    Notes:
29699ae82921SPaul Mullowney    If nnz is given then nz is ignored
29709ae82921SPaul Mullowney 
29719ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
29729ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
29739ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
29749ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
29759ae82921SPaul Mullowney 
29769ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
29770298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
29789ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
29799ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
29809ae82921SPaul Mullowney 
29819ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
29829ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
29839ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
29849ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
29859ae82921SPaul Mullowney 
29869ae82921SPaul Mullowney    Level: intermediate
29879ae82921SPaul Mullowney 
2988e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
29899ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscInt       *rownnz = (PetscInt*)nnz; /* the preallocation routine is called with a non-const pointer */
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create the object, size it (local and global sizes are both m x n), then pick the type */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate the host storage from nz / nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,rownnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30019ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - Destroys the GPU-side data attached to A, removes the
   functions composed on the object by this type, and finally calls MatDestroy_SeqAIJ().

   For unfactored matrices the cusparse struct is destroyed with MatSeqAIJCUSPARSE_Destroy();
   for factored ones MatSeqAIJCUSPARSETriFactors_Destroy() is used. If a device matrix
   (deviceMat) was attached, it is released here with cudaFree().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* detach the device matrix first: spptr is destroyed right below and d_mat is freed separately */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
    cudaError_t err;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    if (a->compressedrow.use) {
      /* the host copy of the struct is only needed to reach the diag.i device pointer,
         so the device-to-host transfer is done only when that pointer has to be freed */
      PetscSplitCSRDataStructure h_mat;

      err = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  /* MatBindToCPU_SeqAIJCUSPARSE() composes this one as well, so remove it like the others */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30379ae82921SPaul Mullowney 
3038ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
303995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
   MatDuplicate_SeqAIJCUSPARSE - Duplicates A with MatDuplicate_SeqAIJ() and then converts
   the freshly created matrix in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  Mat            dup;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  dup  = *B;
  /* in-place conversion: the same Mat handle is passed as input and as output slot */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(dup,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30499ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

   Dispatch:
     - X and Y have different axpy implementations: MatAXPY_SeqAIJ() is used.
     - SAME_NONZERO_PATTERN (given, or detected by comparing the device row offsets and
       column indices): a single cuBLAS axpy over the value arrays.
     - SUBSET_NONZERO_PATTERN: cuSPARSE csr geam, writing the result into Y's own arrays.
     - anything else: MatAXPY_SeqAIJ().

   Only MAT_CUSPARSE_CSR is supported for both matrices on the GPU paths.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *xh = (Mat_SeqAIJ*)X->data,*yh = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *ydev,*xdev;
  CsrMatrix          *ycsr,*xcsr;
  const PetscScalar  *xval;
  PetscScalar        *yval;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  ydev = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  xdev = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (ydev->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (xdev->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix*)ydev->mat->mat;
  xcsr = (CsrMatrix*)xdev->mat->mat;

  /* see if we can turn this into a cublas axpy: equal nonzero counts plus identical
     row offsets and column indices mean the patterns coincide */
  if (str != SAME_NONZERO_PATTERN && xh->nz == yh->nz && !xh->compressedrow.use && !yh->compressedrow.use) {
    bool samepattern = thrust::equal(thrust::device,ycsr->row_offsets->begin(),ycsr->row_offsets->end(),xcsr->row_offsets->begin())
                    && thrust::equal(thrust::device,ycsr->column_indices->begin(),ycsr->column_indices->end(),xcsr->column_indices->begin());
    if (samepattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  /* neither GPU path applies: fall back to the SeqAIJ implementation */
  if (str != SUBSET_NONZERO_PATTERN && str != SAME_NONZERO_PATTERN) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* both GPU paths operate directly on the device value arrays */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xval);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yval);CHKERRQ(ierr);
  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    /* a and b are host scalars, so switch the handle to host pointer mode for the call */
    stat = cusparseSetPointerMode(ydev->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 needs an explicit work buffer */
    stat = cusparse_csr_spgeam_bufferSize(ydev->handle,Y->rmap->n,Y->cmap->n,
                                          &a,xdev->mat->descr,xh->nz,xval,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                          &b,ydev->mat->descr,yh->nz,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                             ydev->mat->descr,       yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(ydev->handle,Y->rmap->n,Y->cmap->n,
                               &a,xdev->mat->descr,xh->nz,xval,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,ydev->mat->descr,yh->nz,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  ydev->mat->descr,       yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgeam(ydev->handle,Y->rmap->n,Y->cmap->n,
                               &a,xdev->mat->descr,xh->nz,xval,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,ydev->mat->descr,yh->nz,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  ydev->mat->descr,       yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(xh->nz + yh->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    /* put the handle back into device pointer mode */
    stat = cusparseSetPointerMode(ydev->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  } else {
    /* identical patterns: the value arrays line up entry by entry, a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(xh->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,xval,one,yval,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xval);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yval);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
314895639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - Scales Y by a with one cuBLAS scal call over the nz stored
   values of the device array, then invalidates the cached diagonal information.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *yh = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yval;
  PetscBLASInt   inc = 1, len = 1;
  cublasHandle_t handle;
  cublasStatus_t berr;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yval);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* cuBLAS takes a BLAS-sized integer count; the cast is checked */
  ierr = PetscBLASIntCast(yh->nz,&len);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(handle,len,&a,yval,inc);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(len);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yval);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
317233c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeroes the stored values of A on the host and, when
   they exist, on the device (matrix and cached transpose).

   The offload mask ends up as PETSC_OFFLOAD_BOTH if the device values of the matrix were
   zeroed too, and as PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      devzeroed = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        devzeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    /* zeroing the transpose values does not influence the offload mask */
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* host side: zero the a->i[nrows] stored values directly instead of calling MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = devzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
32043fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Switches the operation table and the composed functions
   of A between the CUSPARSE (GPU) and the plain SeqAIJ (CPU) implementations.

   flg == PETSC_TRUE  : data is first copied back from the GPU, then the SeqAIJ routines
                        are installed and the CUSPARSE-only composed functions removed.
   flg == PETSC_FALSE : the CUSPARSE routines are installed.

   Factored matrices are left untouched. On exit A->boundtocpu and the inode usage flag
   are both set to flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (!flg) {
    /* GPU implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* bring the host copy up to date before the CPU routines take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* CPU implementations; no Hermitian-transpose products are installed here */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }
  A->boundtocpu  = flg;
  aij->inode.use = flg; /* inodes are used only while bound to the CPU */
  PetscFunctionReturn(0);
}
3255a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE one.

   reuse == MAT_INITIAL_MATRIX : A is duplicated (with values) into *newmat first.
   reuse == MAT_REUSE_MATRIX   : A is copied into the existing *newmat first.
   otherwise                   : *newmat is converted as it is.

   The target gets VECCUDA as default vector type, a cusparse handle (unless it already
   has GPU data attached or is being reused), the CUSPARSE operation table and the
   "MatCUSPARSESetFormat_C" composed function. The mtype argument is not used.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to prepare: *newmat is used as given */
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    /* attach the GPU-side container; its layout depends on whether B is a factor */
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *fact;

      ierr = PetscNew(&fact);CHKERRQ(ierr);
      stat = cusparseCreate(&fact->handle);CHKERRCUSPARSE(stat);
      B->spptr = fact;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      cusp->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      B->spptr = cusp;
      cusp->deviceMat = NULL;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining GPU operations and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33039ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix,
   converts it in place to the CUSPARSE type and then processes the type's options.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host part first, GPU part on top of it */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);

  /* PetscOptionsObject is not declared in this function; presumably it is supplied by PetscObjectOptionsBegin() */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
331602fe1965SBarry Smith 
33173ca39a21SBarry Smith /*MC
3318e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3319e057df02SPaul Mullowney 
3320e057df02SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
33212692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
33222692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3323e057df02SPaul Mullowney 
3324e057df02SPaul Mullowney    Options Database Keys:
3325e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3326aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3327a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3328e057df02SPaul Mullowney 
3329e057df02SPaul Mullowney   Level: beginner
3330e057df02SPaul Mullowney 
33318468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3332e057df02SPaul Mullowney M*/
33337f756511SDominic Meiser 
333442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
333542c9c57cSBarry Smith 
33360f39cd5aSBarry Smith 
33373ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
333842c9c57cSBarry Smith {
333942c9c57cSBarry Smith   PetscErrorCode ierr;
334042c9c57cSBarry Smith 
334142c9c57cSBarry Smith   PetscFunctionBegin;
33423ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33433ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33443ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33453ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
334642c9c57cSBarry Smith   PetscFunctionReturn(0);
334742c9c57cSBarry Smith }
334829b38603SBarry Smith 
3349470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
33507f756511SDominic Meiser {
3351e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
33527f756511SDominic Meiser   cusparseStatus_t stat;
33537f756511SDominic Meiser 
33547f756511SDominic Meiser   PetscFunctionBegin;
33557f756511SDominic Meiser   if (*cusparsestruct) {
3356e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3357e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
33587f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
335981902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
33607e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
33617e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
33627e8381f9SStefano Zampini     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3363afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3364afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
3365afb2bd1cSJunchao Zhang    #endif
3366e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
33677f756511SDominic Meiser   }
33687f756511SDominic Meiser   PetscFunctionReturn(0);
33697f756511SDominic Meiser }
33707f756511SDominic Meiser 
33717f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
33727f756511SDominic Meiser {
33737f756511SDominic Meiser   PetscFunctionBegin;
33747f756511SDominic Meiser   if (*mat) {
33757f756511SDominic Meiser     delete (*mat)->values;
33767f756511SDominic Meiser     delete (*mat)->column_indices;
33777f756511SDominic Meiser     delete (*mat)->row_offsets;
33787f756511SDominic Meiser     delete *mat;
33797f756511SDominic Meiser     *mat = 0;
33807f756511SDominic Meiser   }
33817f756511SDominic Meiser   PetscFunctionReturn(0);
33827f756511SDominic Meiser }
33837f756511SDominic Meiser 
3384470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
33857f756511SDominic Meiser {
33867f756511SDominic Meiser   cusparseStatus_t stat;
33877f756511SDominic Meiser   PetscErrorCode   ierr;
33887f756511SDominic Meiser 
33897f756511SDominic Meiser   PetscFunctionBegin;
33907f756511SDominic Meiser   if (*trifactor) {
339157d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3392afb2bd1cSJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
33937f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
33941b0a6780SStefano Zampini     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
33952cbc15d9SMark     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3396afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
33971b0a6780SStefano Zampini     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3398afb2bd1cSJunchao Zhang    #endif
3399da79fbbcSStefano Zampini     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
34007f756511SDominic Meiser   }
34017f756511SDominic Meiser   PetscFunctionReturn(0);
34027f756511SDominic Meiser }
34037f756511SDominic Meiser 
3404470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
34057f756511SDominic Meiser {
34067f756511SDominic Meiser   CsrMatrix        *mat;
34077f756511SDominic Meiser   cusparseStatus_t stat;
34087f756511SDominic Meiser   cudaError_t      err;
34097f756511SDominic Meiser 
34107f756511SDominic Meiser   PetscFunctionBegin;
34117f756511SDominic Meiser   if (*matstruct) {
34127f756511SDominic Meiser     if ((*matstruct)->mat) {
34137f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3414afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3415afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3416afb2bd1cSJunchao Zhang        #else
34177f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
341857d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3419afb2bd1cSJunchao Zhang        #endif
34207f756511SDominic Meiser       } else {
34217f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
34227f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
34237f756511SDominic Meiser       }
34247f756511SDominic Meiser     }
342557d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
34267f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
3427afb2bd1cSJunchao Zhang     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
34287656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
34297656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3430afb2bd1cSJunchao Zhang 
3431afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3432afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3433afb2bd1cSJunchao Zhang     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3434afb2bd1cSJunchao Zhang     for (int i=0; i<3; i++) {
3435afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
3436afb2bd1cSJunchao Zhang         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3437afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3438afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3439afb2bd1cSJunchao Zhang       }
3440afb2bd1cSJunchao Zhang     }
3441afb2bd1cSJunchao Zhang    #endif
34427f756511SDominic Meiser     delete *matstruct;
34437e8381f9SStefano Zampini     *matstruct = NULL;
34447f756511SDominic Meiser   }
34457f756511SDominic Meiser   PetscFunctionReturn(0);
34467f756511SDominic Meiser }
34477f756511SDominic Meiser 
3448ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
34497f756511SDominic Meiser {
3450e6e9a74fSStefano Zampini   PetscErrorCode ierr;
3451e6e9a74fSStefano Zampini 
34527f756511SDominic Meiser   PetscFunctionBegin;
34537f756511SDominic Meiser   if (*trifactors) {
3454e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3455e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3456e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3457e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
34587f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
34597f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
34607f756511SDominic Meiser     delete (*trifactors)->workVector;
34617e8381f9SStefano Zampini     (*trifactors)->rpermIndices = NULL;
34627e8381f9SStefano Zampini     (*trifactors)->cpermIndices = NULL;
34637e8381f9SStefano Zampini     (*trifactors)->workVector = NULL;
3464ccdfe979SStefano Zampini   }
3465ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3466ccdfe979SStefano Zampini }
3467ccdfe979SStefano Zampini 
3468ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3469ccdfe979SStefano Zampini {
3470e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
3471ccdfe979SStefano Zampini   cusparseHandle_t handle;
3472ccdfe979SStefano Zampini   cusparseStatus_t stat;
3473ccdfe979SStefano Zampini 
3474ccdfe979SStefano Zampini   PetscFunctionBegin;
3475ccdfe979SStefano Zampini   if (*trifactors) {
3476e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
34777f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
347857d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
34797f756511SDominic Meiser     }
3480e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
34817f756511SDominic Meiser   }
34827f756511SDominic Meiser   PetscFunctionReturn(0);
34837f756511SDominic Meiser }
34847e8381f9SStefano Zampini 
34857e8381f9SStefano Zampini struct IJCompare
34867e8381f9SStefano Zampini {
34877e8381f9SStefano Zampini   __host__ __device__
34887e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
34897e8381f9SStefano Zampini   {
34907e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
34917e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
34927e8381f9SStefano Zampini     return false;
34937e8381f9SStefano Zampini   }
34947e8381f9SStefano Zampini };
34957e8381f9SStefano Zampini 
34967e8381f9SStefano Zampini struct IJEqual
34977e8381f9SStefano Zampini {
34987e8381f9SStefano Zampini   __host__ __device__
34997e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
35007e8381f9SStefano Zampini   {
35017e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
35027e8381f9SStefano Zampini     return true;
35037e8381f9SStefano Zampini   }
35047e8381f9SStefano Zampini };
35057e8381f9SStefano Zampini 
35067e8381f9SStefano Zampini struct IJDiff
35077e8381f9SStefano Zampini {
35087e8381f9SStefano Zampini   __host__ __device__
35097e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
35107e8381f9SStefano Zampini   {
35117e8381f9SStefano Zampini     return t1 == t2 ? 0 : 1;
35127e8381f9SStefano Zampini   }
35137e8381f9SStefano Zampini };
35147e8381f9SStefano Zampini 
35157e8381f9SStefano Zampini struct IJSum
35167e8381f9SStefano Zampini {
35177e8381f9SStefano Zampini   __host__ __device__
35187e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
35197e8381f9SStefano Zampini   {
35207e8381f9SStefano Zampini     return t1||t2;
35217e8381f9SStefano Zampini   }
35227e8381f9SStefano Zampini };
35237e8381f9SStefano Zampini 
35247e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
3525e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
35267e8381f9SStefano Zampini {
35277e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3528fcdce8c4SStefano Zampini   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3529bfcc3627SStefano Zampini   THRUSTARRAY                           *cooPerm_v = NULL;
353008391a17SStefano Zampini   thrust::device_ptr<const PetscScalar> d_v;
35317e8381f9SStefano Zampini   CsrMatrix                             *matrix;
35327e8381f9SStefano Zampini   PetscErrorCode                        ierr;
35337e8381f9SStefano Zampini   cudaError_t                           cerr;
35347e8381f9SStefano Zampini   PetscInt                              n;
35357e8381f9SStefano Zampini 
35367e8381f9SStefano Zampini   PetscFunctionBegin;
35377e8381f9SStefano Zampini   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
35387e8381f9SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
35397e8381f9SStefano Zampini   if (!cusp->cooPerm) {
35407e8381f9SStefano Zampini     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
35417e8381f9SStefano Zampini     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
35427e8381f9SStefano Zampini     PetscFunctionReturn(0);
35437e8381f9SStefano Zampini   }
35447e8381f9SStefano Zampini   matrix = (CsrMatrix*)cusp->mat->mat;
35457e8381f9SStefano Zampini   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3546e61fc153SStefano Zampini   if (!v) {
3547e61fc153SStefano Zampini     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3548e61fc153SStefano Zampini     goto finalize;
35497e8381f9SStefano Zampini   }
3550e61fc153SStefano Zampini   n = cusp->cooPerm->size();
355108391a17SStefano Zampini   if (isCudaMem(v)) {
355208391a17SStefano Zampini     d_v = thrust::device_pointer_cast(v);
355308391a17SStefano Zampini   } else {
3554e61fc153SStefano Zampini     cooPerm_v = new THRUSTARRAY(n);
3555e61fc153SStefano Zampini     cooPerm_v->assign(v,v+n);
355608391a17SStefano Zampini     d_v = cooPerm_v->data();
3557e61fc153SStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
355808391a17SStefano Zampini   }
3559bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3560e61fc153SStefano Zampini   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
35617e8381f9SStefano Zampini     if (cusp->cooPerm_a) {
3562bfcc3627SStefano Zampini       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
356308391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3564e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3565e61fc153SStefano Zampini       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3566e61fc153SStefano Zampini       delete cooPerm_w;
35677e8381f9SStefano Zampini     } else {
356808391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
35697e8381f9SStefano Zampini                                                                 matrix->values->begin()));
357008391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
35717e8381f9SStefano Zampini                                                                 matrix->values->end()));
35727e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
35737e8381f9SStefano Zampini     }
35747e8381f9SStefano Zampini   } else {
3575e61fc153SStefano Zampini     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
357608391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3577e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
35787e8381f9SStefano Zampini     } else {
357908391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
35807e8381f9SStefano Zampini                                                                 matrix->values->begin()));
358108391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
35827e8381f9SStefano Zampini                                                                 matrix->values->end()));
35837e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
35847e8381f9SStefano Zampini     }
35857e8381f9SStefano Zampini   }
35867e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3587bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3588e61fc153SStefano Zampini finalize:
3589e61fc153SStefano Zampini   delete cooPerm_v;
35907e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3591e61fc153SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3592fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
3593fcdce8c4SStefano Zampini   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3594fcdce8c4SStefano Zampini   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3595fcdce8c4SStefano Zampini   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3596fcdce8c4SStefano Zampini   a->reallocs         = 0;
3597fcdce8c4SStefano Zampini   A->info.mallocs    += 0;
3598fcdce8c4SStefano Zampini   A->info.nz_unneeded = 0;
3599fcdce8c4SStefano Zampini   A->assembled = A->was_assembled = PETSC_TRUE;
3600fcdce8c4SStefano Zampini   A->num_ass++;
36017e8381f9SStefano Zampini   PetscFunctionReturn(0);
36027e8381f9SStefano Zampini }
36037e8381f9SStefano Zampini 
36047e8381f9SStefano Zampini #include <thrust/binary_search.h>
3605e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
36067e8381f9SStefano Zampini {
36077e8381f9SStefano Zampini   PetscErrorCode     ierr;
36087e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
36097e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
36107e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
36117e8381f9SStefano Zampini   cudaError_t        cerr;
36127e8381f9SStefano Zampini 
36137e8381f9SStefano Zampini   PetscFunctionBegin;
36147e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
36157e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
36167e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
36177e8381f9SStefano Zampini   if (n != cooPerm_n) {
36187e8381f9SStefano Zampini     delete cusp->cooPerm;
36197e8381f9SStefano Zampini     delete cusp->cooPerm_a;
36207e8381f9SStefano Zampini     cusp->cooPerm = NULL;
36217e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
36227e8381f9SStefano Zampini   }
36237e8381f9SStefano Zampini   if (n) {
36247e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
36257e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
36267e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
36277e8381f9SStefano Zampini 
36287e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
36297e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
36307e8381f9SStefano Zampini 
36317e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
36327e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
36337e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
36347e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
36357e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
36367e8381f9SStefano Zampini 
363708391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
36387e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
36397e8381f9SStefano Zampini     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
36407e8381f9SStefano Zampini     *cusp->cooPerm_a = d_i;
36417e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
36427e8381f9SStefano Zampini 
36437e8381f9SStefano Zampini     auto nekey = thrust::unique(fkey, ekey, IJEqual());
36447e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
36457e8381f9SStefano Zampini       delete cusp->cooPerm_a;
36467e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
36477e8381f9SStefano Zampini     } else { /* I couldn't come up with a more elegant algorithm */
36487e8381f9SStefano Zampini       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
36497e8381f9SStefano Zampini       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
36507e8381f9SStefano Zampini       (*cusp->cooPerm_a)[0] = 0;
36517e8381f9SStefano Zampini       w[0] = 0;
36527e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
36537e8381f9SStefano Zampini       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
36547e8381f9SStefano Zampini     }
36557e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
36567e8381f9SStefano Zampini     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
36577e8381f9SStefano Zampini                         search_begin, search_begin + A->rmap->n,
36587e8381f9SStefano Zampini                         ii.begin());
365908391a17SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
366008391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
36617e8381f9SStefano Zampini 
36627e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
36637e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
36647e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
36657e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
36667e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
36677e8381f9SStefano Zampini     a->i[0] = 0;
36687e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
36697e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
3670fcdce8c4SStefano Zampini     a->rmax = 0;
36717e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
36727e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
36737e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
36747e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
36757e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
36767e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
36777e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
36787e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
36797e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
3680fcdce8c4SStefano Zampini       a->rmax = PetscMax(a->rmax,nnzr);
36817e8381f9SStefano Zampini     }
3682fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
36837e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
36847e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3685fcdce8c4SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
36867e8381f9SStefano Zampini   } else {
36877e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
36887e8381f9SStefano Zampini   }
3689e61fc153SStefano Zampini   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
36907e8381f9SStefano Zampini 
36917e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
3692e61fc153SStefano Zampini      The code is so convoluted now that I prefer to copy zeros */
3693e61fc153SStefano Zampini   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
36947e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
36957e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
36967e8381f9SStefano Zampini   A->nonzerostate++;
36977e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
36987e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
36997e8381f9SStefano Zampini 
37007e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
37017e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
37027e8381f9SStefano Zampini   PetscFunctionReturn(0);
37037e8381f9SStefano Zampini }
3704ed502f03SStefano Zampini 
3705ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3706ed502f03SStefano Zampini {
3707ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3708ed502f03SStefano Zampini   CsrMatrix          *csr;
3709ed502f03SStefano Zampini   PetscErrorCode     ierr;
3710ed502f03SStefano Zampini 
3711ed502f03SStefano Zampini   PetscFunctionBegin;
3712ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3713ed502f03SStefano Zampini   PetscValidPointer(a,2);
3714ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3715ed502f03SStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3716ed502f03SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
371733c9ba73SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3718ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
3719ed502f03SStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3720ed502f03SStefano Zampini   *a = csr->values->data().get();
3721ed502f03SStefano Zampini   PetscFunctionReturn(0);
3722ed502f03SStefano Zampini }
3723ed502f03SStefano Zampini 
3724ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3725ed502f03SStefano Zampini {
3726ed502f03SStefano Zampini   PetscFunctionBegin;
3727ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3728ed502f03SStefano Zampini   PetscValidPointer(a,2);
3729ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3730ed502f03SStefano Zampini   *a = NULL;
3731ed502f03SStefano Zampini   PetscFunctionReturn(0);
3732ed502f03SStefano Zampini }
3733ed502f03SStefano Zampini 
3734039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3735039c6fbaSStefano Zampini {
3736039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3737039c6fbaSStefano Zampini   CsrMatrix          *csr;
3738039c6fbaSStefano Zampini   PetscErrorCode     ierr;
3739039c6fbaSStefano Zampini 
3740039c6fbaSStefano Zampini   PetscFunctionBegin;
3741039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3742039c6fbaSStefano Zampini   PetscValidPointer(a,2);
3743039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3744039c6fbaSStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3745039c6fbaSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
374633c9ba73SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3747039c6fbaSStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
3748039c6fbaSStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3749039c6fbaSStefano Zampini   *a = csr->values->data().get();
3750039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3751039c6fbaSStefano Zampini   PetscFunctionReturn(0);
3752039c6fbaSStefano Zampini }
3753039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Increases the object state of A and resets the caller's pointer.

  Input Parameters:
+ A - the matrix (must be of type MATSEQAIJCUSPARSE)
- a - address of the pointer to reset; *a is NULL on return

  Notes:
  Presumably meant to be paired with MatSeqAIJCUSPARSEGetArray().
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscObject    obj;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  obj  = (PetscObject)A;
  /* record that the matrix may have changed */
  ierr = PetscObjectStateIncrease(obj);CHKERRQ(ierr);
  /* invalidate the pointer held by the caller */
  *a   = NULL;
  PetscFunctionReturn(0);
}
3766039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Returns a pointer to the values array of the CSR matrix stored in the
  cuSPARSE data structure of a MATSEQAIJCUSPARSE matrix, without updating the GPU copy first.

  Input Parameter:
. A - the matrix (must be of type MATSEQAIJCUSPARSE, ELL and HYB formats are rejected with PETSC_ERR_SUP)

  Output Parameter:
. a - pointer to the values of the CSR matrix (presumably device memory, see the "Missing CUDA memory" check)

  Notes:
  Unlike MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSECopyToGPU() is not called here, so the
  cuSPARSE struct must already exist; otherwise PETSC_ERR_COR is raised.
  On success the offload mask of A is set to PETSC_OFFLOAD_GPU.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* access the type-specific data only after A has been validated */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard against a missing CSR struct as well as a missing values array */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller is going to overwrite the values through the returned pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
3784ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Increases the object state of A and resets the caller's pointer.

  Input Parameters:
+ A - the matrix (must be of type MATSEQAIJCUSPARSE)
- a - address of the pointer to reset; *a is NULL on return

  Notes:
  Presumably meant to be paired with MatSeqAIJCUSPARSEGetArrayWrite().
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscObject    obj;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  obj  = (PetscObject)A;
  /* record that the matrix values have been written */
  ierr = PetscObjectStateIncrease(obj);CHKERRQ(ierr);
  /* invalidate the pointer held by the caller */
  *a   = NULL;
  PetscFunctionReturn(0);
}
3797ed502f03SStefano Zampini 
/*
  Lexicographic "less than" on the first two entries of a 4-tuple; the third and fourth entries are ignored.
  In MatSeqAIJCUSPARSEMergeMats() the tuples are (COO row, column, value, origin flag), so this orders by (row, column).
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    /* use the free function thrust::get<>, which does not rely on the tuple type providing a member get<>() */
    const int i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 != i2) return i1 < i2;
    /* equal first entries: the second entry decides */
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
3808ed502f03SStefano Zampini 
/* Unary functor adding a fixed integer offset (set at construction) to its argument */
struct Shift
{
  int _shift; /* offset added by operator() */

  Shift(int shift) : _shift(shift) {}
  /* const-qualified: the functor has no mutable state, so it can be invoked through const objects too */
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3820ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices with the same number of rows,
  [A';B']' operation in matlab notation, i.e. C = [A, B]

  Input Parameters:
+ A     - the first matrix, its columns come first in C
. B     - the second matrix, its column indices are shifted by the number of columns of A
- reuse - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX (MAT_INPLACE_MATRIX is rejected with PETSC_ERR_SUP)

  Output Parameter:
. C - the merged matrix; created here with MAT_INITIAL_MATRIX, updated with MAT_REUSE_MATRIX

  Notes:
  Only the CSR storage format is handled, ELL and HYB are rejected with PETSC_ERR_SUP.

  With MAT_INITIAL_MATRIX the row offsets of A and B are expanded to COO row indices and the two
  (row,column,value) sequences are merged ordered by (row,column). The positions in C of the entries
  of A, followed by the positions of the entries of B, are stored in the cooPerm array of C.
  The row offsets and column indices of C are then copied to the host; the host values array is only allocated.

  With MAT_REUSE_MATRIX only the values are updated, using the stored cooPerm.

  If both A and B have transgen set, the transpose of C is assembled (or its values refreshed) by
  concatenating the transposes of A and B.

  On return C is marked assembled with offload mask PETSC_OFFLOAD_GPU.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* access the type-specific data only after A and B have been validated */
  a     = (Mat_SeqAIJ*)A->data;
  b     = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and the GPU data structures describing it */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      { /* scope of the work arrays: they are released when the scope is left, also on early error returns */
        THRUSTINTARRAY32 Acoo(Annz);  /* COO row indices of A */
        THRUSTINTARRAY32 Bcoo(Bnnz);  /* COO row indices of B */
        THRUSTINTARRAY32 Ccoo(c->nz); /* COO row indices of C */
        THRUSTINTARRAY32 *Aroff,*Broff;

        if (a->compressedrow.use) { /* need full row offset */
          if (!Acusp->rowoffsets_gpu) {
            Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
            Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
            ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Aroff = Acusp->rowoffsets_gpu;
        } else Aroff = Acsr->row_offsets;
        if (b->compressedrow.use) { /* need full row offset */
          if (!Bcusp->rowoffsets_gpu) {
            Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
            Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
            ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Broff = Bcusp->rowoffsets_gpu;
        } else Broff = Bcsr->row_offsets;
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* expand the CSR row offsets of A and B to COO row indices */
        stat = cusparseXcsr2coo(Acusp->handle,
                                Aroff->data().get(),
                                Annz,
                                m,
                                Acoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseXcsr2coo(Bcusp->handle,
                                Broff->data().get(),
                                Bnnz,
                                m,
                                Bcoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
        /* origin flags carried through the merge: 1 for entries of A, 0 for entries of B */
        auto Aperm = thrust::make_constant_iterator(1);
        auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
        auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
        auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
        /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
        auto Bcib = Bcsr->column_indices->begin();
        auto Bcie = Bcsr->column_indices->end();
        thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
        THRUSTINTARRAY32 wPerm(Annz+Bnnz); /* origin flag of each entry of C after the merge */
        auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
        auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
        auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
        auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
        auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
        /* cooPerm layout: [0,Annz) positions in C of the entries of A, [Annz,nz) positions of the entries of B */
        auto p1 = Ccusp->cooPerm->begin();
        auto p2 = Ccusp->cooPerm->begin();
        thrust::advance(p2,Annz);
        PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
        /* undo the in-place shift of the column indices of B */
        thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
        auto cci = thrust::make_counting_iterator(zero);
        auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
        PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm.begin(),p1,p2,thrust::identity<int>()));
#else
        auto pred = thrust::identity<int>();
        /* positions flagged 1 (from A) go to the first part of cooPerm, positions flagged 0 (from B) to the second */
        PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm.begin(),p1,pred));
        PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm.begin(),p2,pred));
#endif
        /* compress the merged COO row indices into the row offsets of C */
        stat = cusparseXcoo2csr(Ccusp->handle,
                                Ccoo.data().get(),
                                c->nz,
                                m,
                                Ccsr->row_offsets->data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* C^T = [A^T; B^T]: concatenate the row offsets, those of B^T shifted by the number of nonzeros of A */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* step back one entry: the last offset of A^T is overwritten by the first (shifted) offset of B^T */
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        /* column indices and values of C^T are those of A^T followed by those of B^T */
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* set up the host side of C: row offsets and column indices are copied back, values are only allocated */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* the matrix to be reused is an input here: validate it before looking at its data */
    PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
    PetscCheckTypeName(*C,MATSEQAIJCUSPARSE);
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* the nonzero counts must still match those recorded when C was created */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* pmid separates the positions of the entries of A (before) from those of B (after) in cooPerm */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* pair the values of A with the values of C permuted by cooPerm; VecCUDAEquals presumably assigns the first to the second */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* same for the values of B, using the second part of cooPerm */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* refresh the values of C^T: those of A^T followed by those of B^T */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4123c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the values array of A into v.

  Input Parameters:
+ A   - the matrix
. n   - number of entries to copy
- idx - positions in the values array of the entries to copy; if NULL the first n values are copied

  Output Parameter:
. v - destination array of length n; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  The values are obtained with MatSeqAIJCUSPARSEGetArrayRead(). When idx is given, the entries are
  gathered on the device; if v is host memory they are gathered into a device staging buffer first
  and then copied to the host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* move the indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* device staging buffer, only sized when v is host memory; being a local object it is released on every exit path */
    THRUSTARRAY w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[k]] with dv[k]; VecCUDAEquals presumably assigns the first to the second */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values have been moved from the device to the host: log the transfer in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4163