xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 65e4b4d46e77b7a054a0705b877751ea429d3d2b)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17bddcd29dSMark Adams #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
18bddcd29dSMark Adams #include <cooperative_groups.h>
19bddcd29dSMark Adams #endif
20e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
21afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
22afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
23afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
24afb2bd1cSJunchao Zhang 
25afb2bd1cSJunchao Zhang   typedef enum {
26afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
27afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
28afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
29afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
30afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
31afb2bd1cSJunchao Zhang 
32afb2bd1cSJunchao Zhang   typedef enum {
33afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
34afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
35afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
36afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
37afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
45afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
46afb2bd1cSJunchao Zhang 
47afb2bd1cSJunchao Zhang   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
50afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
51afb2bd1cSJunchao Zhang   */
52afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
53afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
54afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
55afb2bd1cSJunchao Zhang #endif
569ae82921SPaul Mullowney 
57087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
58087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
59087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
60087f3262SPaul Mullowney 
61bddcd29dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
62bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
887f756511SDominic Meiser 
8957181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
9057181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
9257181aedSStefano Zampini 
937e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
947e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
957e8381f9SStefano Zampini 
96c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
97c215019aSStefano Zampini 
/*
   MatCUSPARSESetStream - Stores a CUDA stream in the matrix's Mat_SeqAIJCUSPARSE data and
   sets that stream on the cuSPARSE handle held by the same data.

   Input Parameters:
+  A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
-  stream - the CUDA stream to store and hand to cusparseSetStream()

   Raises PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  /* remember the stream first, then pass the stored value to cuSPARSE */
  spptr->stream = stream;
  cstat = cusparseSetStream(spptr->handle,spptr->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
109b06137fdSPaul Mullowney 
/*
   MatCUSPARSESetHandle - Makes the matrix use the given cuSPARSE handle.

   Input Parameters:
+  A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
-  handle - the cuSPARSE handle to store

   If the matrix currently holds a different, non-NULL handle, that handle is destroyed
   with cusparseDestroy() before the new one is stored. The stored handle is then put in
   CUSPARSE_POINTER_MODE_DEVICE. Raises PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (spptr->handle != handle) {
    /* a different handle is being installed: destroy the one currently held, if any */
    if (spptr->handle) {
      cstat = cusparseDestroy(spptr->handle);CHKERRCUSPARSE(cstat);
    }
    spptr->handle = handle;
  }
  /* applied on every call, including when the handle did not change */
  cstat = cusparseSetPointerMode(spptr->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
126b06137fdSPaul Mullowney 
/*
   MatCUSPARSEClearHandle - Sets the cuSPARSE handle stored in the matrix to 0.

   Input Parameter:
.  A - the matrix

   Does nothing unless A is of type MATSEQAIJCUSPARSE and A->spptr is non-NULL.
   The handle is only zeroed here; cusparseDestroy() is not called on it.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && spptr && spptr->handle) spptr->handle = 0;
  PetscFunctionReturn(0);
}
139b06137fdSPaul Mullowney 
/*
   MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

   Input Parameter:
.  A - the matrix (not examined)

   Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  (void)A; /* the answer does not depend on the matrix */
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1469ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1599ae82921SPaul Mullowney 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the MATSEQAIJCUSPARSE matrix that will hold
   a factorization of A and installs the symbolic-factorization callbacks for the requested type.

   Input Parameters:
+  A     - the matrix to be factored; its local row count sizes the factor
-  ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

   Output Parameter:
.  B - the newly created factor matrix

   Any other ftype raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt nloc = A->rmap->n;
  Mat            F;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B;
  /* the factor is created square, nloc x nloc */
  ierr = MatSetSizes(F,nloc,nloc,nloc,nloc);CHKERRQ(ierr);
  F->factortype  = ftype;
  F->useordering = PETSC_TRUE;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
    F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1859ae82921SPaul Mullowney 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Stores the requested storage format in the matrix's
   Mat_SeqAIJCUSPARSE data.

   Input Parameters:
+  A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
.  op     - must be MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL; both store into the same field
-  format - the storage format to record

   Any other op raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op != MAT_CUSPARSE_MULT && op != MAT_CUSPARSE_ALL) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  spptr->format = format;
  PetscFunctionReturn(0);
}
2039ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* forward to the implementation composed on A under "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
230e057df02SPaul Mullowney 
/*
   MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE here and passes every
   other option to MatSetOption_SeqAIJ().

   Input Parameters:
+  A   - the matrix
.  op  - the option being set
-  flg - the new value of the option
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MAT_FORM_EXPLICIT_TRANSPOSE) {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
  if (A->form_explicit_transpose && !flg) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  A->form_explicit_transpose = flg;
  PetscFunctionReturn(0);
}
248e6e9a74fSStefano Zampini 
249bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
250bddcd29dSMark Adams 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization for SEQAIJCUSPARSE.

   Input Parameters:
+  B    - the factor matrix
.  A    - the matrix being factored
-  info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

   Calls MatSeqAIJCUSPARSECopyFromGPU() on A, runs MatLUFactorNumeric_SeqAIJ(), chooses
   the solve callbacks from the row/column orderings, and then calls
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() on B.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row,colperm = fact->col;
  PetscBool      rowid,colid,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the natural-ordering solves are used only when both orderings are the identity */
  ierr = ISIdentity(rowperm,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colid);CHKERRQ(ierr);
  natural = (PetscBool)(rowid && colid);

  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  /* the multi-right-hand-side solve slots are cleared in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
281bddcd29dSMark Adams 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Reads the -mat_cusparse_* options for a SEQAIJCUSPARSE matrix.

   Input Parameters:
+  PetscOptionsObject - the options context (the name is required by the PetscOptions macros)
-  A                  - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE

   Options are only queried when A is not a factor matrix (A->factortype == MAT_FACTOR_NONE).
   With CUDA 11 or later the SpMV, SpMM and CSR-to-CSC algorithm choices are read as well.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)spptr->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)spptr->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* PetscOptionsEnum() sets enum values from their position in the string table, so after each
       user-supplied choice one cuSPARSE enum value is compared against the value PETSc assumes */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)spptr->spmvAlg,(PetscEnum*)&spptr->spmvAlg,&set);CHKERRQ(ierr);
    if (set && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)spptr->spmmAlg,(PetscEnum*)&spptr->spmmAlg,&set);CHKERRQ(ierr);
    if (set && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)spptr->csr2cscAlg,(PetscEnum*)&spptr->csr2cscAlg,&set);CHKERRQ(ierr);
    if (set && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3179ae82921SPaul Mullowney 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for SEQAIJCUSPARSE.

   Resets the triangular-factor data reached through B->spptr, runs
   MatILUFactorSymbolic_SeqAIJ() with the same arguments, and installs
   MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric LU routine of B.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3299ae82921SPaul Mullowney 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for SEQAIJCUSPARSE.

   Resets the triangular-factor data reached through B->spptr, runs
   MatLUFactorSymbolic_SeqAIJ() with the same arguments, and installs
   MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric LU routine of B.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3419ae82921SPaul Mullowney 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for SEQAIJCUSPARSE.

   Resets the triangular-factor data reached through B->spptr, runs
   MatICCFactorSymbolic_SeqAIJ() with the same arguments, and installs
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric Cholesky routine of B.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
353087f3262SPaul Mullowney 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for SEQAIJCUSPARSE.

   Resets the triangular-factor data reached through B->spptr, runs
   MatCholeskyFactorSymbolic_SeqAIJ() with the same arguments, and installs
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric Cholesky routine of B.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
365087f3262SPaul Mullowney 
366087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3679ae82921SPaul Mullowney {
3689ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3699ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3709ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3729ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3739ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3749ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3759ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3769ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
377b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
37857d48284SJunchao Zhang   cudaError_t                       cerr;
3799ae82921SPaul Mullowney 
3809ae82921SPaul Mullowney   PetscFunctionBegin;
381cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
382c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3839ae82921SPaul Mullowney     try {
3849ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3859ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
386da79fbbcSStefano Zampini       if (!loTriFactor) {
3872cbc15d9SMark         PetscScalar                       *AALo;
3882cbc15d9SMark 
3892cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3909ae82921SPaul Mullowney 
3919ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
39257d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
39357d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3949ae82921SPaul Mullowney 
3959ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3969ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3979ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3989ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3999ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4009ae82921SPaul Mullowney         v        = aa;
4019ae82921SPaul Mullowney         vi       = aj;
4029ae82921SPaul Mullowney         offset   = 1;
4039ae82921SPaul Mullowney         rowOffset= 1;
4049ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4059ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
406e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4079ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4089ae82921SPaul Mullowney           rowOffset += nz+1;
4099ae82921SPaul Mullowney 
410580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
411580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
4129ae82921SPaul Mullowney 
4139ae82921SPaul Mullowney           offset      += nz;
4149ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4159ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4169ae82921SPaul Mullowney           offset      += 1;
4179ae82921SPaul Mullowney 
4189ae82921SPaul Mullowney           v  += nz;
4199ae82921SPaul Mullowney           vi += nz;
4209ae82921SPaul Mullowney         }
4212205254eSKarl Rupp 
422aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
423da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
424da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425aa372e3fSPaul Mullowney         /* Create the matrix description */
42657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
42757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4281b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430afb2bd1cSJunchao Zhang        #else
43157d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432afb2bd1cSJunchao Zhang        #endif
43357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
43457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
435aa372e3fSPaul Mullowney 
436aa372e3fSPaul Mullowney         /* set the operation */
437aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
438aa372e3fSPaul Mullowney 
439aa372e3fSPaul Mullowney         /* set the matrix */
440aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
441aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
442aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
443aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
444aa372e3fSPaul Mullowney 
445aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
447aa372e3fSPaul Mullowney 
448aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
450aa372e3fSPaul Mullowney 
451aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
453aa372e3fSPaul Mullowney 
454afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
455da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
456afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4571b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464afb2bd1cSJunchao Zhang       #endif
465afb2bd1cSJunchao Zhang 
466aa372e3fSPaul Mullowney         /* perform the solve analysis */
467aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4711b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473afb2bd1cSJunchao Zhang                                #endif
474afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
475da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
476da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
477aa372e3fSPaul Mullowney 
478da79fbbcSStefano Zampini         /* assign the pointer */
479aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4802cbc15d9SMark         loTriFactor->AA_h = AALo;
48157d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
48257d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4834863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
484da79fbbcSStefano Zampini       } else { /* update values only */
4852cbc15d9SMark         if (!loTriFactor->AA_h) {
4862cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4872cbc15d9SMark         }
488da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4892cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
490da79fbbcSStefano Zampini         v        = aa;
491da79fbbcSStefano Zampini         vi       = aj;
492da79fbbcSStefano Zampini         offset   = 1;
493da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
494da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4952cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
496da79fbbcSStefano Zampini           offset      += nz;
4972cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
498da79fbbcSStefano Zampini           offset      += 1;
499da79fbbcSStefano Zampini           v  += nz;
500da79fbbcSStefano Zampini         }
5012cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
503da79fbbcSStefano Zampini       }
5049ae82921SPaul Mullowney     } catch(char *ex) {
5059ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5069ae82921SPaul Mullowney     }
5079ae82921SPaul Mullowney   }
5089ae82921SPaul Mullowney   PetscFunctionReturn(0);
5099ae82921SPaul Mullowney }
5109ae82921SPaul Mullowney 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Assembles the upper triangular ILU factor of A in CSR form
   on the host and hands it to the cuSPARSE triangular-factor structure attached to A.

   Input Parameter:
.  A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

   Notes:
   Returns immediately when A has no local rows, or when A->offloadmask is neither
   PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

   Row i of the factor is located through a->diag: its off-diagonal entries are read from positions
   diag[i+1]+1 .. diag[i]-1 and its diagonal entry from position diag[i]. Every output row stores the
   reciprocal of that diagonal entry first, followed by the off-diagonal entries unchanged.

   When no upper factor is attached yet, the CSR structure, the matrix descriptor and the solve
   analysis are created; otherwise only the numerical values are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij      = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows     = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper    = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    *hostCols = aij->j, *diag = aij->diag;
  const MatScalar                   *hostVals = aij->a;
  PetscInt                          row, rowLen, nnz, pos;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (nrows == 0) PetscFunctionReturn(0);
  /* only rebuild when the host copy is the current one */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of stored entries of the upper factor, diagonal included */
    nnz = diag[0] - diag[nrows];

    if (upper) {
      /* the structure already exists: recompute and upload the values only */
      if (!upper->AA_h) {
        cerr = cudaMallocHost((void**) &upper->AA_h, nnz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* walk the rows from last to first, filling the value array from its end */
      pos = nnz;
      for (row = nrows; row-- > 0;) {
        const MatScalar *src = hostVals + diag[row+1] + 1;

        /* entries of this row that are NOT on the diagonal */
        rowLen = diag[row] - diag[row+1] - 1;
        pos   -= rowLen + 1;

        /* reciprocal of the diagonal goes first, the rest of the row follows */
        upper->AA_h[pos] = 1./src[rowLen];
        ierr = PetscArraycpy(&(upper->AA_h[pos+1]), src, rowLen);CHKERRQ(ierr);
      }
      upper->csrMat->values->assign(upper->AA_h, upper->AA_h+nnz);
      ierr = PetscLogCpuToGpu(nnz*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      PetscScalar *vals;
      PetscInt    *rowPtr, *colIdx;
      CsrMatrix   *csr;

      /* pinned host staging buffers for values, row offsets and column indices */
      cerr = cudaMallocHost((void**) &vals, nnz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &rowPtr, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &colIdx, nnz*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* walk the rows from last to first, filling the CSR arrays from their end */
      rowPtr[0]     = (PetscInt) 0;
      rowPtr[nrows] = nnz;
      pos           = nnz;
      for (row = nrows; row-- > 0;) {
        const MatScalar *src  = hostVals + diag[row+1] + 1;
        const PetscInt  *srcj = hostCols + diag[row+1] + 1;

        /* entries of this row that are NOT on the diagonal */
        rowLen = diag[row] - diag[row+1] - 1;
        pos   -= rowLen + 1;

        /* the diagonal entry (stored as its reciprocal) leads the row */
        colIdx[pos] = (PetscInt) row;
        vals[pos]   = (MatScalar)1./src[rowLen];
        rowPtr[row] = rowPtr[row+1] - (rowLen+1);

        ierr = PetscArraycpy(&(colIdx[pos+1]), srcj, rowLen);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(vals[pos+1]), src, rowLen);CHKERRQ(ierr);
      }

      /* fresh container for the triangular factor */
      ierr = PetscNew(&upper);CHKERRQ(ierr);
      upper->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero-based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upper->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upper->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upper->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upper->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the factor is solved without transposition */
      upper->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* move the staged CSR arrays into the thrust containers */
      csr               = new CsrMatrix;
      upper->csrMat     = csr;
      csr->num_rows     = nrows;
      csr->num_cols     = nrows;
      csr->num_entries  = nnz;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(rowPtr, rowPtr+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnz);
      csr->column_indices->assign(colIdx, colIdx+nnz);

      csr->values = new THRUSTARRAY(nnz);
      csr->values->assign(vals, vals+nnz);

      /* solve analysis: info object, optional work buffer, then the analysis itself */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upper->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(factors->handle, upper->solveOp,
                                     csr->num_rows, csr->num_entries, upper->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upper->solveInfo,
                                     &upper->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upper->solveBuffer,upper->solveBufferSize);CHKERRCUDA(cerr);
     #endif

      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upper->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               ,upper->solvePolicy, upper->solveBuffer
                             #endif
);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value staging buffer is kept for later value-only updates */
      factors->upTriFactorPtr = upper;
      upper->AA_h             = vals;
      cerr = cudaFreeHost(rowPtr);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(colIdx);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnz)*sizeof(int)+nnz*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6549ae82921SPaul Mullowney 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both ILU triangular factors of A for cuSPARSE and
   caches the row/column permutations alongside them.

   Input Parameter:
.  A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

   Notes:
   Errors with PETSC_ERR_COR when A->spptr is NULL.

   After the lower and upper factors are built, a work vector of length A->rmap->n is created if none
   exists, the nonzero count a->nz is recorded, and A->offloadmask is set to PETSC_OFFLOAD_BOTH.

   The indices of a->row (resp. a->icol) are copied into rpermIndices (resp. cpermIndices) only when
   that index set is not the identity and the corresponding array has not been created yet.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  const PetscInt               nrows    = A->rmap->n;
  IS                           rowperm  = aij->row, colperm = aij->icol;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) the two triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the triangular solves, created once */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copied once, and only when it is not the identity */
  ierr = ISIdentity(rowperm,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || factors->rpermIndices)) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(colperm,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || factors->cpermIndices)) {
    const PetscInt *idx;

    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6989ae82921SPaul Mullowney 
699087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700087f3262SPaul Mullowney {
701087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
702087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705087f3262SPaul Mullowney   cusparseStatus_t                  stat;
706087f3262SPaul Mullowney   PetscErrorCode                    ierr;
70757d48284SJunchao Zhang   cudaError_t                       cerr;
708087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
709087f3262SPaul Mullowney   PetscScalar                       *AAUp;
710087f3262SPaul Mullowney   PetscScalar                       *AALo;
711087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
713087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
714087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
715087f3262SPaul Mullowney 
716087f3262SPaul Mullowney   PetscFunctionBegin;
717cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
718c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719087f3262SPaul Mullowney     try {
720da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
723087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
72457d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
72557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
726087f3262SPaul Mullowney 
727087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
728087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
729087f3262SPaul Mullowney         AiUp[n]=nzUpper;
730087f3262SPaul Mullowney         offset = 0;
731087f3262SPaul Mullowney         for (i=0; i<n; i++) {
732087f3262SPaul Mullowney           /* set the pointers */
733087f3262SPaul Mullowney           v  = aa + ai[i];
734087f3262SPaul Mullowney           vj = aj + ai[i];
735087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
736087f3262SPaul Mullowney 
737087f3262SPaul Mullowney           /* first, set the diagonal elements */
738087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
73909f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
740087f3262SPaul Mullowney           AiUp[i]      = offset;
74109f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
742087f3262SPaul Mullowney 
743087f3262SPaul Mullowney           offset+=1;
744087f3262SPaul Mullowney           if (nz>0) {
745f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
746580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
747087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
748087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
749087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
750087f3262SPaul Mullowney             }
751087f3262SPaul Mullowney             offset+=nz;
752087f3262SPaul Mullowney           }
753087f3262SPaul Mullowney         }
754087f3262SPaul Mullowney 
755aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
756da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
757da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
758087f3262SPaul Mullowney 
759aa372e3fSPaul Mullowney         /* Create the matrix description */
76057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
76157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7621b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764afb2bd1cSJunchao Zhang        #else
76557d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766afb2bd1cSJunchao Zhang        #endif
76757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
76857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
769087f3262SPaul Mullowney 
770aa372e3fSPaul Mullowney         /* set the matrix */
771aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
772aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
773aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
774aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
775aa372e3fSPaul Mullowney 
776aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
778aa372e3fSPaul Mullowney 
779aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
781aa372e3fSPaul Mullowney 
782aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
784aa372e3fSPaul Mullowney 
785afb2bd1cSJunchao Zhang         /* set the operation */
786afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
787afb2bd1cSJunchao Zhang 
788afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
789da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
790afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7911b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798afb2bd1cSJunchao Zhang       #endif
799afb2bd1cSJunchao Zhang 
800aa372e3fSPaul Mullowney         /* perform the solve analysis */
801aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
8051b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807afb2bd1cSJunchao Zhang                                 #endif
808afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
809da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
810da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
811aa372e3fSPaul Mullowney 
812da79fbbcSStefano Zampini         /* assign the pointer */
813aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
814aa372e3fSPaul Mullowney 
815aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
816da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
817da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
818aa372e3fSPaul Mullowney 
819aa372e3fSPaul Mullowney         /* Create the matrix description */
82057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
82157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8221b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824afb2bd1cSJunchao Zhang        #else
82557d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826afb2bd1cSJunchao Zhang        #endif
82757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
82857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
829aa372e3fSPaul Mullowney 
830aa372e3fSPaul Mullowney         /* set the operation */
831aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
832aa372e3fSPaul Mullowney 
833aa372e3fSPaul Mullowney         /* set the matrix */
834aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
835aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
836aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
837aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
838aa372e3fSPaul Mullowney 
839aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
841aa372e3fSPaul Mullowney 
842aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
847aa372e3fSPaul Mullowney 
848afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
849da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
850afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8511b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858afb2bd1cSJunchao Zhang       #endif
859afb2bd1cSJunchao Zhang 
860aa372e3fSPaul Mullowney         /* perform the solve analysis */
861aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8651b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867afb2bd1cSJunchao Zhang                                 #endif
868afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
869da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
870da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
871aa372e3fSPaul Mullowney 
872da79fbbcSStefano Zampini         /* assign the pointer */
873aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
874087f3262SPaul Mullowney 
875da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
87657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
87757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878da79fbbcSStefano Zampini       } else {
879da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
880da79fbbcSStefano Zampini         offset = 0;
881da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
882da79fbbcSStefano Zampini           /* set the pointers */
883da79fbbcSStefano Zampini           v  = aa + ai[i];
884da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
885da79fbbcSStefano Zampini 
886da79fbbcSStefano Zampini           /* first, set the diagonal elements */
887da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
888da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
889da79fbbcSStefano Zampini 
890da79fbbcSStefano Zampini           offset+=1;
891da79fbbcSStefano Zampini           if (nz>0) {
892da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
893da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
894da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
895da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
896da79fbbcSStefano Zampini             }
897da79fbbcSStefano Zampini             offset+=nz;
898da79fbbcSStefano Zampini           }
899da79fbbcSStefano Zampini         }
900da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
901da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
902da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
905da79fbbcSStefano Zampini       }
90657d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
90757d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908087f3262SPaul Mullowney     } catch(char *ex) {
909087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910087f3262SPaul Mullowney     }
911087f3262SPaul Mullowney   }
912087f3262SPaul Mullowney   PetscFunctionReturn(0);
913087f3262SPaul Mullowney }
914087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - puts the ICC/Cholesky triangular factors of A on the GPU.

  Calls MatSeqAIJCUSPARSEBuildICCTriMatrices() to (re)build the factors, makes sure the work vector
  exists, records the number of nonzeros of the factorization and, when the ordering stored in
  a->row is not the identity, copies the permutation and its inverse to the device.

  This routine may run more than once on the same matrix (the work vector is only created when it
  is missing), so the permutation arrays are likewise allocated only on first use and simply
  refreshed afterwards; allocating them unconditionally would leak the previous device arrays.
  NOTE(review): this relies on rpermIndices/cpermIndices starting out NULL, exactly like
  workVector does - confirm against the creation of Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* every off-diagonal entry is counted twice (once per triangular factor), the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices are only needed for a non-natural ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* allocate once, then let assign() resize/overwrite on subsequent calls */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
952087f3262SPaul Mullowney 
/*
  CHECK_LAUNCH_ERROR - to be placed right after a kernel launch inside a function returning
  PetscErrorCode. It first queries cudaGetLastError() and then cudaDeviceSynchronize(); if either
  one does not return cudaSuccess, the enclosing function errors out through SETERRQ1() with the
  CUDA error string.
*/
#define CHECK_LAUNCH_ERROR()                                                                              \
do {                                                                                                      \
  /* errors already pending at this point, e.g. an invalid launch configuration */                        \
  cudaError_t launch_err_ = cudaGetLastError();                                                           \
  if (launch_err_ != cudaSuccess) {                                                                       \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(launch_err_));          \
  }                                                                                                       \
  /* errors raised while the device finishes its outstanding work */                                      \
  launch_err_ = cudaDeviceSynchronize();                                                                  \
  if (launch_err_ != cudaSuccess) {                                                                       \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(launch_err_));          \
  }                                                                                                       \
 } while (0)
9669ae82921SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  The factorization itself is done by MatCholeskyFactorNumeric_SeqAIJ() after A has been copied
  back from the GPU; afterwards the GPU solve routines are installed on B (the natural-ordering
  variants when the ordering in B is the identity) and the triangular factors are analyzed and
  copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  IS             perm = ((Mat_SeqAIJ*)B->data)->row;
  PetscBool      natural;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve flavour according to the ordering */
  ierr = ISIdentity(perm,&natural);CHKERRQ(ierr);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no multi-right-hand-side solves are installed, whatever the ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9969ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSETransposeTriFactor_Private - builds the explicit transpose (CSC) of one
  triangular factor on the GPU and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A      - the factored matrix, only used as the object the log events are attached to
. handle - the cuSPARSE handle to use
- factor - the triangular factor to transpose

  Output Parameter:
. factorT_out - the newly created transposed factor

  Notes:
  The descriptor of the transposed factor copies matrix type, index base and diagonal type from
  the source and flips the fill mode. With CUDA >= 11 the csr2csc scratch buffer is allocated
  here and stored on the source factor (factor->csr2cscBuffer).
*/
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactor_Private(Mat A,cusparseHandle_t handle,Mat_SeqAIJCUSPARSETriFactorStruct *factor,Mat_SeqAIJCUSPARSETriFactorStruct **factorT_out)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *factorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!factor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&factorT);CHKERRQ(ierr);
  factorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the matrix descriptors of the source factor; the transpose has the opposite fill mode */
  matrixType = cusparseGetMatType(factor->descr);
  indexBase  = cusparseGetMatIndexBase(factor->descr);
  fillMode   = cusparseGetMatFillMode(factor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(factor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&factorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(factorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(factorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(factorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(factorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so it is solved non-transposed */
  factorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  factorT->csrMat = new CsrMatrix;
  factorT->csrMat->num_rows       = factor->csrMat->num_cols;
  factorT->csrMat->num_cols       = factor->csrMat->num_rows;
  factorT->csrMat->num_entries    = factor->csrMat->num_entries;
  factorT->csrMat->row_offsets    = new THRUSTINTARRAY32(factorT->csrMat->num_rows+1);
  factorT->csrMat->column_indices = new THRUSTINTARRAY32(factorT->csrMat->num_entries);
  factorT->csrMat->values         = new THRUSTARRAY(factorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(handle, factor->csrMat->num_rows,
                                       factor->csrMat->num_cols, factor->csrMat->num_entries,
                                       factor->csrMat->values->data().get(),
                                       factor->csrMat->row_offsets->data().get(),
                                       factor->csrMat->column_indices->data().get(),
                                       factorT->csrMat->values->data().get(),
                                       factorT->csrMat->row_offsets->data().get(), factorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &factor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&factor->csr2cscBuffer,factor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(handle, factor->csrMat->num_rows,
                          factor->csrMat->num_cols, factor->csrMat->num_entries,
                          factor->csrMat->values->data().get(),
                          factor->csrMat->row_offsets->data().get(),
                          factor->csrMat->column_indices->data().get(),
                          factorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          factorT->csrMat->row_offsets->data().get(), factorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, factor->csr2cscBuffer
                        #else
                          factorT->csrMat->column_indices->data().get(), factorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (this used to be a second PetscLogEventBegin()) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&factorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(handle, factorT->solveOp,
                                 factorT->csrMat->num_rows, factorT->csrMat->num_entries, factorT->descr,
                                 factorT->csrMat->values->data().get(), factorT->csrMat->row_offsets->data().get(),
                                 factorT->csrMat->column_indices->data().get(), factorT->solveInfo,
                                 &factorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&factorT->solveBuffer,factorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(handle, factorT->solveOp,
                           factorT->csrMat->num_rows, factorT->csrMat->num_entries, factorT->descr,
                           factorT->csrMat->values->data().get(), factorT->csrMat->row_offsets->data().get(),
                           factorT->csrMat->column_indices->data().get(), factorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,factorT->solvePolicy, factorT->solveBuffer
                          #endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *factorT_out = factorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - creates the explicit transposes of the lower and
  upper triangular factors stored in A->spptr, analyzes them for triangular solves and stores them
  in loTriFactorPtrTranspose and upTriFactorPtrTranspose.

  The lower factor is processed first and its transpose is stored before the upper factor is
  touched.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor_Private(A,cusparseTriFactors->handle,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor_Private(A,cusparseTriFactors->handle,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1198bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, converting the real part of a PetscScalar to a PetscInt */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar value)
  {
    /* any imaginary part is dropped; the real part is converted like a plain cast */
    return static_cast<PetscInt>(PetscRealPart(value));
  }
};
1207a49f1ed0SStefano Zampini 
12081a2c6b5cSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209bda325fcSPaul Mullowney {
1210aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1214aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1215b06137fdSPaul Mullowney   cudaError_t                  err;
121685ba7357SStefano Zampini   PetscErrorCode               ierr;
1217b175d8bbSPaul Mullowney 
1218bda325fcSPaul Mullowney   PetscFunctionBegin;
12191a2c6b5cSJunchao Zhang   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1220a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1221a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222a49f1ed0SStefano Zampini   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
12241a2c6b5cSJunchao Zhang   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
12251a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
122685ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1227a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1229a49f1ed0SStefano Zampini   }
1230a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
123257d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
123457d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
123557d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1236aa372e3fSPaul Mullowney 
1237b06137fdSPaul Mullowney     /* set alpha and beta */
1238afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12397656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12407656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12427656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12437656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1244b06137fdSPaul Mullowney 
1245aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1247a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1248554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1249554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1250aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1251a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1254a3fdcf43SKarl Rupp 
1255039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
125681902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1257afb2bd1cSJunchao Zhang 
1258afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&matstructT->matDescr,
1260afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1263afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265afb2bd1cSJunchao Zhang      #endif
1266aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269afb2bd1cSJunchao Zhang    #else
1270aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
127151c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
127251c6d536SStefano Zampini       /* First convert HYB to CSR */
1273aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1274aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1275aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1276aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1279aa372e3fSPaul Mullowney 
1280aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1281aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282aa372e3fSPaul Mullowney                               temp->values->data().get(),
1283aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
128457d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1285aa372e3fSPaul Mullowney 
1286aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1288aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1289aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1290aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1293aa372e3fSPaul Mullowney 
1294aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1296aa372e3fSPaul Mullowney                               temp->values->data().get(),
1297aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1298aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1299aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1300aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1301aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
130257d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1303aa372e3fSPaul Mullowney 
1304aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1305aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
130657d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1310aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1311aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1312aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
131357d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1314aa372e3fSPaul Mullowney 
1315aa372e3fSPaul Mullowney       /* assign the pointer */
1316aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13171a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1318aa372e3fSPaul Mullowney       /* delete temporaries */
1319aa372e3fSPaul Mullowney       if (tempT) {
1320aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1324087f3262SPaul Mullowney       }
1325aa372e3fSPaul Mullowney       if (temp) {
1326aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1327aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1330aa372e3fSPaul Mullowney       }
1331afb2bd1cSJunchao Zhang      #endif
1332aa372e3fSPaul Mullowney     }
1333a49f1ed0SStefano Zampini   }
1334a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1335a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1336a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337a49f1ed0SStefano Zampini     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338a49f1ed0SStefano Zampini     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339a49f1ed0SStefano Zampini     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340a49f1ed0SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341a49f1ed0SStefano Zampini     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342a49f1ed0SStefano Zampini     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343a49f1ed0SStefano Zampini     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344a49f1ed0SStefano Zampini     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1347a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1349a49f1ed0SStefano Zampini     }
1350a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1351a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1352a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1353a49f1ed0SStefano Zampini 
1354a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1357a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1358a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1360a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1361a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1362a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1363a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1364a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1366a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368a49f1ed0SStefano Zampini      #endif
1369a49f1ed0SStefano Zampini 
13701a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13711a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13721a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13731a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13741a2c6b5cSJunchao Zhang 
13751a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13761a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13771a2c6b5cSJunchao Zhang         */
13781a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
13791a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
13801a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
13811a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
13821a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1383a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1384a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1386a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
13871a2c6b5cSJunchao Zhang                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1388a49f1ed0SStefano Zampini                              #else
1389a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
13901a2c6b5cSJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1391a49f1ed0SStefano Zampini                              #endif
13921a2c6b5cSJunchao Zhang       } else {
13931a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
13941a2c6b5cSJunchao Zhang       }
13951a2c6b5cSJunchao Zhang 
1396a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1397a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1398a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1399a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1400a49f1ed0SStefano Zampini      #endif
1401a49f1ed0SStefano Zampini     }
1402a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1403a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1404a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1405a49f1ed0SStefano Zampini   }
140685ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1407213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1408213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1409aa372e3fSPaul Mullowney   /* assign the pointer */
1410aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
14111a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1412bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1413bda325fcSPaul Mullowney }
1414bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve using the triangular factors stored in A->spptr,
  with the row and column permutations kept alongside the factors.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The transposed factors are created by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() the first time they
  are needed. cusparseTriFactors->workVector is used as scratch storage between the two triangular solves.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Both factors are dereferenced below; fail cleanly instead of crashing if either is absent */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing transpose triangular factors");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation.
     The length of a permutation range is set by its index iterators, so both ends share the same element iterator. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1501bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve using the triangular factors stored
  in A->spptr; no row or column permutation is applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The transposed factors are created by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() the first time they
  are needed. cusparseTriFactors->workVector holds the intermediate result between the two solves.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Both factors are dereferenced below; fail cleanly instead of crashing if either is absent */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing transpose triangular factors");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads barray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1569bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Solve using the triangular factors stored in A->spptr, with the row and
  column permutations kept alongside the factors.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  cusparseTriFactors->workVector receives the row-permuted right-hand side and later the result of the
  U solve; xx holds the intermediate result of the L solve before it is overwritten with the final answer.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Both factors are dereferenced below; fail cleanly instead of crashing if either is absent */
  if (!loTriFactor || !upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing triangular factors");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    tempGPU->begin()));

  /* Next, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
                                    xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16449ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Applies the two triangular factors stored in A->spptr
  to bb and writes the result into xx.

  The lower factor is applied to the device array of bb with the result placed in the factors'
  work vector; the upper factor is then applied to the work vector with the result placed in the
  device array of xx. Unlike the solve routine that precedes it in this file, no permutation
  copies are performed here.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side (read-only device access)

  Output Parameter:
. xx - the result (write-only device access)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  status;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* Acquire the device arrays: xx for writing, bb for reading */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Lower factor: rhs -> work */
  status = cusparse_solve(factors->handle, lower->solveOp,
                          lower->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lower->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lower->descr,
                          lower->csrMat->values->data().get(),
                          lower->csrMat->row_offsets->data().get(),
                          lower->csrMat->column_indices->data().get(),
                          lower->solveInfo,
                          rhs, work->data().get()
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lower->solvePolicy, lower->solveBuffer
                        #endif
);CHKERRCUSPARSE(status);

  /* Upper factor: work -> sol */
  status = cusparse_solve(factors->handle, upper->solveOp,
                          upper->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upper->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upper->descr,
                          upper->csrMat->values->data().get(),
                          upper->csrMat->row_offsets->data().get(),
                          upper->csrMat->column_indices->data().get(),
                          upper->solveInfo,
                          work->data().get(), sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upper->solvePolicy, upper->solveBuffer
                        #endif
);CHKERRCUSPARSE(status);

  /* Release the arrays, wait for the device, then close the timing region and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17049ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the nonzero values of A from the device CSR storage back
  into the host array a->a when the offload mask says the device holds the only valid copy.

  Nothing is done unless A->offloadmask == PETSC_OFFLOAD_GPU. After a successful copy the offload
  mask is set to PETSC_OFFLOAD_BOTH. Only the values are transferred (a->nz scalars).

  Errors:
+ PETSC_ERR_PLIB - the device data structure, its CSR matrix, or its values array is missing,
                   or the host values array is missing while a->nz is nonzero
- PETSC_ERR_SUP  - the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB; in those formats
                   the `mat` pointer of the mult struct holds a cusparseHybMat_t (see
                   MatSeqAIJCUSPARSECopyToGPU), so it must not be read as a CsrMatrix
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    if (!cusp || !cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    /* the cast below is only valid for the CSR format */
    if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Copy from GPU not implemented for MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values on the GPU");
    if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device values now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17257e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host array of nonzero values of A.

  The values are first brought back from the device (if the device holds the only valid copy),
  then the offload mask is set to PETSC_OFFLOAD_CPU -- presumably because the caller may modify
  the values through the returned pointer.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a of the Mat_SeqAIJ data)
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
17377e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the device representation of A up to date with the host data.

  Errors with PETSC_ERR_PLIB if A is bound to the CPU. Nothing is copied unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Two paths:
  - If A->nonzerostate matches the state recorded in the device structure and the format is
    MAT_CUSPARSE_CSR, only the values array is re-assigned from a->a; the transpose is then
    invalidated with PETSC_FALSE.
  - Otherwise the existing mult struct, work vector and rowoffsets_gpu are destroyed and everything
    is rebuilt: a matrix descriptor, three device scalars (alpha_one, beta_zero, beta_one), and the
    CSR arrays (row offsets, column indices, values). With compressed rows in use, the compressed
    row arrays of the host matrix are used and cprowIndices plus a work vector are allocated.
    For MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB the CSR arrays are temporary and converted with
    cusparse_csr2hyb (before CUDA 11); with CUDA 11 or later these formats raise PETSC_ERR_SUP.
    The recorded nonzerostate is updated at the end of this path.

  The offload mask becomes PETSC_OFFLOAD_BOTH unless the rebuild path found no host values
  (a->a NULL), in which case the mask is left untouched.
*/
17386fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
17399ae82921SPaul Mullowney {
1740aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
17417c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
17429ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1743213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
17449ae82921SPaul Mullowney   PetscErrorCode               ierr;
1745aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1746abb89eb1SStefano Zampini   PetscBool                    both = PETSC_TRUE;
1747b06137fdSPaul Mullowney   cudaError_t                  err;
17489ae82921SPaul Mullowney 
17499ae82921SPaul Mullowney   PetscFunctionBegin;
1750fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1751c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1752a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1753a49f1ed0SStefano Zampini       CsrMatrix *matrix;
1754afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
175585ba7357SStefano Zampini 
1756abb89eb1SStefano Zampini       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
175785ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1758afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
175905035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
17604863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
176185ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1762a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
176334d6c7a5SJose E. Roman     } else {
      /* Full rebuild: discard every device-side structure before recreating it */
1764abb89eb1SStefano Zampini       PetscInt nnz;
176585ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
17667c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1767a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
17687c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
176981902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
1770a49f1ed0SStefano Zampini       cusparsestruct->workVector = NULL;
1771a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
17729ae82921SPaul Mullowney       try {
        /* Select the row data: compressed-row arrays when in use, the plain CSR row offsets otherwise */
17739ae82921SPaul Mullowney         if (a->compressedrow.use) {
17749ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
17759ae82921SPaul Mullowney           ii   = a->compressedrow.i;
17769ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
17779ae82921SPaul Mullowney         } else {
1778213423ffSJunchao Zhang           m    = A->rmap->n;
1779213423ffSJunchao Zhang           ii   = a->i;
1780e6e9a74fSStefano Zampini           ridx = NULL;
17819ae82921SPaul Mullowney         }
1782abb89eb1SStefano Zampini         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1783abb89eb1SStefano Zampini         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* Without host values, take the entry count from the last row offset and do not mark A as
           up to date on both sides; the device values array is then allocated but not filled */
1784abb89eb1SStefano Zampini         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1785abb89eb1SStefano Zampini         else nnz = a->nz;
17869ae82921SPaul Mullowney 
178785ba7357SStefano Zampini         /* create cusparse matrix */
1788abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
1789aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
179057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
179157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
179257d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
17939ae82921SPaul Mullowney 
        /* The scalar constants are kept in device memory and the handle is switched to device
           pointer mode */
1794afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
17957656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
17967656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1797afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17987656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17997656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
180057d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1801b06137fdSPaul Mullowney 
1802aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1803aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1804aa372e3fSPaul Mullowney           /* set the matrix */
1805afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1806afb2bd1cSJunchao Zhang           mat->num_rows = m;
1807afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1808abb89eb1SStefano Zampini           mat->num_entries = nnz;
1809afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1810afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
18119ae82921SPaul Mullowney 
1812abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1813abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1814aa372e3fSPaul Mullowney 
1815abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1816abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1817aa372e3fSPaul Mullowney 
1818aa372e3fSPaul Mullowney           /* assign the pointer */
1819afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1820afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1821afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1822afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1823afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1824afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1825afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1826afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1827afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1828afb2bd1cSJunchao Zhang           }
1829afb2bd1cSJunchao Zhang          #endif
1830aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1831afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          /* NOTE(review): this error return happens after matstruct and its three device scalars
             were allocated above; it looks like they are not released on this path -- confirm */
1832afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1833afb2bd1cSJunchao Zhang          #else
          /* Build a temporary CSR copy on the device, convert it, and delete it afterwards */
1834afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1835afb2bd1cSJunchao Zhang           mat->num_rows = m;
1836afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1837abb89eb1SStefano Zampini           mat->num_entries = nnz;
1838afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1839afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1840aa372e3fSPaul Mullowney 
1841abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1842abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1843aa372e3fSPaul Mullowney 
1844abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1845abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1846aa372e3fSPaul Mullowney 
1847aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
184857d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1849aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1850aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1851afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1852afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1853afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1854afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
185557d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1856aa372e3fSPaul Mullowney           /* assign the pointer */
1857aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1858aa372e3fSPaul Mullowney 
1859afb2bd1cSJunchao Zhang           if (mat) {
1860afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1861afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1862afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1863afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1864087f3262SPaul Mullowney           }
1865afb2bd1cSJunchao Zhang          #endif
1866087f3262SPaul Mullowney         }
1867ca45077fSPaul Mullowney 
1868aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1869213423ffSJunchao Zhang         if (a->compressedrow.use) {
1870213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1871aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1872aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1873213423ffSJunchao Zhang           tmp = m;
1874213423ffSJunchao Zhang         } else {
1875213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1876213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1877213423ffSJunchao Zhang           tmp = 0;
1878213423ffSJunchao Zhang         }
        /* Logged volume: (m+1)+nz ints, tmp PetscInts for the compressed row indices, and 3+nz scalars.
           NOTE(review): a->nz is used here even when nnz was taken from ii[m] and no values were
           copied -- confirm this is intended */
1879213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1880aa372e3fSPaul Mullowney 
1881aa372e3fSPaul Mullowney         /* assign the pointer */
1882aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
      /* NOTE(review): this handler only matches exceptions thrown as char*; confirm which exception
         types the calls in the try block can actually throw */
18839ae82921SPaul Mullowney       } catch(char *ex) {
18849ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18859ae82921SPaul Mullowney       }
188605035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
188785ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember the nonzero state this device structure was built for */
188834d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
188934d6c7a5SJose E. Roman     }
1890abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
18919ae82921SPaul Mullowney   }
18929ae82921SPaul Mullowney   PetscFunctionReturn(0);
18939ae82921SPaul Mullowney }
18949ae82921SPaul Mullowney 
/* Functor for tuple-wise traversal: adds element 0 of the tuple into element 1 (second += first). */
struct VecCUDAPlusEquals
{
  template <typename Tuple> __host__ __device__
  void operator()(Tuple entry)
  {
    /* element 1 is the accumulator, element 0 the addend */
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
1904aa372e3fSPaul Mullowney 
/* Functor for tuple-wise traversal: copies element 0 of the tuple into element 1 (second = first). */
struct VecCUDAEquals
{
  template <typename Tuple> __host__ __device__
  void operator()(Tuple entry)
  {
    /* element 0 is the source, element 1 the destination */
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
19147e8381f9SStefano Zampini 
/* Functor for tuple-wise traversal: copies element 1 of the tuple into element 0 (first = second). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple> __host__ __device__
  void operator()(Tuple entry)
  {
    /* element 1 is the source, element 0 the destination */
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1924e6e9a74fSStefano Zampini 
/*
  Data attached to a matrix product involving a SeqAIJCUSPARSE matrix.
  It is released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool             cisdense;
  PetscScalar           *Bt;          /* released with cudaFree() on destruction */
  Mat                   X;            /* receives the sparse-dense product for the RARt and PtAP product types */
  PetscBool             reusesym;     /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C or for X */
  PetscInt              Blda;         /* Record leading dimensions of B and C here to detect changes */
  PetscInt              Clda;
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                  *mmBuffer;    /* device work buffer, sized with cusparseSpMM_bufferSize() in the sparse-dense product */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1944ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything held by a MatMatCusparse and then the struct itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy

  The device buffer Bt is freed, Bcsr is deleted, and (CUDA 11 or later) each non-NULL cuSPARSE
  descriptor and work buffer is destroyed or freed; finally X is destroyed and data is freed
  with PetscFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t status;
 #endif

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {
    status = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(status);
  }
  if (mm->mmBuffer) {
    cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr);
  }
  if (mm->mmBuffer2) {
    cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr);
  }
  if (mm->matBDescr) {
    status = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(status);
  }
  if (mm->matCDescr) {
    status = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(status);
  }
  if (mm->spgemmDesc) {
    status = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(status);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1969ccdfe979SStefano Zampini 
/* Forward declaration only; the definition is not in this part of the file */
1970ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1971ccdfe979SStefano Zampini 
1972ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1973ccdfe979SStefano Zampini {
1974ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1975ccdfe979SStefano Zampini   Mat                          A,B;
1976afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1977ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1978ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1979ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1980ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1981ccdfe979SStefano Zampini   const PetscScalar            *barray;
1982ccdfe979SStefano Zampini   PetscScalar                  *carray;
1983ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1984ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1985ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1986ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1987afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1988ccdfe979SStefano Zampini 
1989ccdfe979SStefano Zampini   PetscFunctionBegin;
1990ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1991ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1992ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1993ccdfe979SStefano Zampini   A    = product->A;
1994ccdfe979SStefano Zampini   B    = product->B;
1995ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1996ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1997ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1998ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1999ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2000ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2001ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2002ccdfe979SStefano Zampini   switch (product->type) {
2003ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2004ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2005ccdfe979SStefano Zampini     mat = cusp->mat;
2006ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007ccdfe979SStefano Zampini     m   = A->rmap->n;
2008ccdfe979SStefano Zampini     n   = B->cmap->n;
2009ccdfe979SStefano Zampini     break;
2010ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20111a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2012e6e9a74fSStefano Zampini       mat = cusp->mat;
2013e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2014e6e9a74fSStefano Zampini     } else {
20151a2c6b5cSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2016ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2017ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2018e6e9a74fSStefano Zampini     }
2019ccdfe979SStefano Zampini     m = A->cmap->n;
2020ccdfe979SStefano Zampini     n = B->cmap->n;
2021ccdfe979SStefano Zampini     break;
2022ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2023ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2024ccdfe979SStefano Zampini     mat = cusp->mat;
2025ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2026ccdfe979SStefano Zampini     m   = A->rmap->n;
2027ccdfe979SStefano Zampini     n   = B->rmap->n;
2028ccdfe979SStefano Zampini     break;
2029ccdfe979SStefano Zampini   default:
2030ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2031ccdfe979SStefano Zampini   }
2032ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2033ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2034ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2035ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2036afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2037ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2038afb2bd1cSJunchao Zhang 
2039ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2040c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2041c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2042c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2043c8378d12SStefano Zampini   } else {
2044c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2045c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2046c8378d12SStefano Zampini   }
2047c8378d12SStefano Zampini 
2048c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2049afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2050afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2051fcdce8c4SStefano Zampini   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2052afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2053fcdce8c4SStefano Zampini     size_t mmBufferSize;
2054afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2055afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2056afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2057afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2058afb2bd1cSJunchao Zhang     }
2059c8378d12SStefano Zampini 
2060afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2061afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2062afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2063afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2064afb2bd1cSJunchao Zhang     }
2065afb2bd1cSJunchao Zhang 
2066afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2067afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2068afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2069afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2070afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2071afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2072afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2073afb2bd1cSJunchao Zhang     }
2074afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2075afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2076afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2077fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2078fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2079fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2080fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2081fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2082fcdce8c4SStefano Zampini     }
2083afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2084afb2bd1cSJunchao Zhang   } else {
2085afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2086afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2087afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2088afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2089afb2bd1cSJunchao Zhang   }
2090afb2bd1cSJunchao Zhang 
2091afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2092afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2093afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2094afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2095fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2096afb2bd1cSJunchao Zhang  #else
2097afb2bd1cSJunchao Zhang   PetscInt k;
2098afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2099ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2100ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2101ccdfe979SStefano Zampini     cublasStatus_t cerr;
2102ccdfe979SStefano Zampini 
2103ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2104ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2105ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2106ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2107ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2108ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2109ccdfe979SStefano Zampini     blda = B->cmap->n;
2110afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2111afb2bd1cSJunchao Zhang   } else {
2112afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2113ccdfe979SStefano Zampini   }
2114ccdfe979SStefano Zampini 
2115afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2116ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2117afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2118ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2119ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2120ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2121ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2122ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2123afb2bd1cSJunchao Zhang  #endif
2124afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2125c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2126c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2127ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2128ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2129ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2130ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2131ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2132ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2133ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2134ccdfe979SStefano Zampini   } else {
2135ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2136ccdfe979SStefano Zampini   }
2137ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2138ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2139ccdfe979SStefano Zampini   }
2140ccdfe979SStefano Zampini   if (!biscuda) {
2141ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2142ccdfe979SStefano Zampini   }
2143ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2144ccdfe979SStefano Zampini }
2145ccdfe979SStefano Zampini 
2146ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2147ccdfe979SStefano Zampini {
2148ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2149ccdfe979SStefano Zampini   Mat                A,B;
2150ccdfe979SStefano Zampini   PetscInt           m,n;
2151ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2152ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2153ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2154ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2155ccdfe979SStefano Zampini 
2156ccdfe979SStefano Zampini   PetscFunctionBegin;
2157ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2158ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2159ccdfe979SStefano Zampini   A    = product->A;
2160ccdfe979SStefano Zampini   B    = product->B;
2161ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2162ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2163ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2164ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2165ccdfe979SStefano Zampini   switch (product->type) {
2166ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2167ccdfe979SStefano Zampini     m = A->rmap->n;
2168ccdfe979SStefano Zampini     n = B->cmap->n;
2169ccdfe979SStefano Zampini     break;
2170ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2171ccdfe979SStefano Zampini     m = A->cmap->n;
2172ccdfe979SStefano Zampini     n = B->cmap->n;
2173ccdfe979SStefano Zampini     break;
2174ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2175ccdfe979SStefano Zampini     m = A->rmap->n;
2176ccdfe979SStefano Zampini     n = B->rmap->n;
2177ccdfe979SStefano Zampini     break;
2178ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2179ccdfe979SStefano Zampini     m = B->cmap->n;
2180ccdfe979SStefano Zampini     n = B->cmap->n;
2181ccdfe979SStefano Zampini     break;
2182ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2183ccdfe979SStefano Zampini     m = B->rmap->n;
2184ccdfe979SStefano Zampini     n = B->rmap->n;
2185ccdfe979SStefano Zampini     break;
2186ccdfe979SStefano Zampini   default:
2187ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2188ccdfe979SStefano Zampini   }
2189ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2190ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2191ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2192ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2193ccdfe979SStefano Zampini 
2194ccdfe979SStefano Zampini   /* product data */
2195ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2196ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2197afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2198afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2199ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2200afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2201ccdfe979SStefano Zampini   }
2202afb2bd1cSJunchao Zhang  #endif
2203ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2204ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2205ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2206ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2207ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2208ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2209ccdfe979SStefano Zampini     } else {
2210ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2211ccdfe979SStefano Zampini     }
2212ccdfe979SStefano Zampini   }
2213ccdfe979SStefano Zampini   C->product->data    = mmdata;
2214ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2215ccdfe979SStefano Zampini 
2216ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2217ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2218ccdfe979SStefano Zampini }
2219ccdfe979SStefano Zampini 
2220fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2221ccdfe979SStefano Zampini {
2222ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2223fcdce8c4SStefano Zampini   Mat                          A,B;
2224fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2225fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2226fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2227fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2228fcdce8c4SStefano Zampini   PetscBool                    flg;
2229ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2230fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2231fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2232fcdce8c4SStefano Zampini   MatProductType               ptype;
2233fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2234fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2235fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2236fcdce8c4SStefano Zampini #endif
2237ccdfe979SStefano Zampini 
2238ccdfe979SStefano Zampini   PetscFunctionBegin;
2239ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2240fcdce8c4SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2241fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2242fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2243fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2244fcdce8c4SStefano Zampini   A = product->A;
2245fcdce8c4SStefano Zampini   B = product->B;
2246fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2247fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2248fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2249fcdce8c4SStefano Zampini     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2250fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
2251fcdce8c4SStefano Zampini     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2252fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2253fcdce8c4SStefano Zampini     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2254fcdce8c4SStefano Zampini     goto finalize;
2255fcdce8c4SStefano Zampini   }
2256fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2257fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2258fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2259fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2260fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2261fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2262fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2264fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2265fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2266fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2267fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2268fcdce8c4SStefano Zampini   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2269fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2270fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2271fcdce8c4SStefano Zampini 
2272fcdce8c4SStefano Zampini   ptype = product->type;
2273fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2274fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2275fcdce8c4SStefano Zampini   switch (ptype) {
2276fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2277fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2278fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2279fcdce8c4SStefano Zampini     break;
2280fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2281fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2282fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2283fcdce8c4SStefano Zampini     break;
2284fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2285fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2286fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2287fcdce8c4SStefano Zampini     break;
2288fcdce8c4SStefano Zampini   default:
2289fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2290fcdce8c4SStefano Zampini   }
2291fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
2292fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2293fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2294fcdce8c4SStefano Zampini   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2295fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2296fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2297fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2298fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2299fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2300fcdce8c4SStefano Zampini   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2301fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2302fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2303fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2304fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2305fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2306fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2307fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2308fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2309fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2310fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2311fcdce8c4SStefano Zampini #else
2312fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2313fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2314fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2315fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2316fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2317fcdce8c4SStefano Zampini #endif
2318fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2319fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2320fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2321fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2322fcdce8c4SStefano Zampini finalize:
2323fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2324fcdce8c4SStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2325fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2326fcdce8c4SStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2327fcdce8c4SStefano Zampini   c->reallocs         = 0;
2328fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2329fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2330fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2331fcdce8c4SStefano Zampini   C->num_ass++;
2332ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2333ccdfe979SStefano Zampini }
2334fcdce8c4SStefano Zampini 
2335fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2336fcdce8c4SStefano Zampini {
2337fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2338fcdce8c4SStefano Zampini   Mat                          A,B;
2339fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2340fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2341fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2342fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2343fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2344fcdce8c4SStefano Zampini   PetscBool                    flg;
2345fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2346fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2347fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2348fcdce8c4SStefano Zampini   MatProductType               ptype;
2349fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2350fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2351fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2352fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2353fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2354fcdce8c4SStefano Zampini   size_t                       bufSize2;
2355fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2356fcdce8c4SStefano Zampini #else
2357fcdce8c4SStefano Zampini   int                          cnz;
2358fcdce8c4SStefano Zampini #endif
2359fcdce8c4SStefano Zampini 
2360fcdce8c4SStefano Zampini   PetscFunctionBegin;
2361fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2362fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2363fcdce8c4SStefano Zampini   A    = product->A;
2364fcdce8c4SStefano Zampini   B    = product->B;
2365fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2366fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2367fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2368fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2369fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2370fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2371fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2372fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2373fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2374fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2375fcdce8c4SStefano Zampini 
2376fcdce8c4SStefano Zampini   /* product data */
2377fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2378fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2379fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2380fcdce8c4SStefano Zampini 
2381fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2382fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2383fcdce8c4SStefano Zampini   ptype = product->type;
2384fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2385fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2386fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2387fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2388fcdce8c4SStefano Zampini   switch (ptype) {
2389fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2390fcdce8c4SStefano Zampini     m = A->rmap->n;
2391fcdce8c4SStefano Zampini     n = B->cmap->n;
2392fcdce8c4SStefano Zampini     k = A->cmap->n;
2393fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2394fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2395fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2396fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2397fcdce8c4SStefano Zampini     break;
2398fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2399fcdce8c4SStefano Zampini     m = A->cmap->n;
2400fcdce8c4SStefano Zampini     n = B->cmap->n;
2401fcdce8c4SStefano Zampini     k = A->rmap->n;
24021a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2403fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2404fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2405fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2406fcdce8c4SStefano Zampini     break;
2407fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2408fcdce8c4SStefano Zampini     m = A->rmap->n;
2409fcdce8c4SStefano Zampini     n = B->rmap->n;
2410fcdce8c4SStefano Zampini     k = A->cmap->n;
24111a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2412fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2413fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2414fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2415fcdce8c4SStefano Zampini     break;
2416fcdce8c4SStefano Zampini   default:
2417fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2418fcdce8c4SStefano Zampini   }
2419fcdce8c4SStefano Zampini 
2420fcdce8c4SStefano Zampini   /* create cusparse matrix */
2421fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2422fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2423fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2424fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2425fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2426fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2427fcdce8c4SStefano Zampini 
2428fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2429fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2430fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2431fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2432fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2433fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2434fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2435fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2436fcdce8c4SStefano Zampini   } else {
2437fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2438fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2439fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2440fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2441fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2442fcdce8c4SStefano Zampini   }
2443fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2444fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2445fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2446fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2447fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2448fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2449fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2450fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2451fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2452fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2453fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2454fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2455fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2456fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2457fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2458fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2459fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2460fcdce8c4SStefano Zampini     c->nz = 0;
2461fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2462fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2463fcdce8c4SStefano Zampini     goto finalizesym;
2464fcdce8c4SStefano Zampini   }
2465fcdce8c4SStefano Zampini 
2466fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2467fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2468fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2469fcdce8c4SStefano Zampini   if (!biscompressed) {
2470fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2471fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2472fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2473fcdce8c4SStefano Zampini #endif
2474fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2475fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2476fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2477fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2478fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2479fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2480fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2481fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2482fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2483fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2484fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2485fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2486fcdce8c4SStefano Zampini     }
2487fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2488fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2489fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2491fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2492fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2493fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2494fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496fcdce8c4SStefano Zampini     }
2497fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2498fcdce8c4SStefano Zampini #endif
2499fcdce8c4SStefano Zampini   }
2500fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2501fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2502fcdce8c4SStefano Zampini   /* precompute flops count */
2503fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2504fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2505fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2506fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2507fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2508fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2509fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2510fcdce8c4SStefano Zampini       }
2511fcdce8c4SStefano Zampini     }
2512fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2513fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2514fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2515fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2516fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2517fcdce8c4SStefano Zampini     }
2518fcdce8c4SStefano Zampini   } else { /* TODO */
2519fcdce8c4SStefano Zampini     flops = 0.;
2520fcdce8c4SStefano Zampini   }
2521fcdce8c4SStefano Zampini 
2522fcdce8c4SStefano Zampini   mmdata->flops = flops;
2523fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2524fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2525fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2526fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2527fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2528fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2529fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2530fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2531fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2532fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2536bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2537fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2538fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2540fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2541fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2542fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2543fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2547fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2548fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2549fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2550fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2551fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2552bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2553fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2554fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2557fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2558fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2559fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2560fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
256100702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2562fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2563fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2564fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2565fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2566fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2567fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2568fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2570fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2571fcdce8c4SStefano Zampini #else
2572fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2573fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2575fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2576fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2577fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2578fcdce8c4SStefano Zampini   c->nz = cnz;
2579fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2580fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2581fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2582fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2583fcdce8c4SStefano Zampini 
2584fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2585fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2586fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2587fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2588fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2589fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2590fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2591fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2592fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2593fcdce8c4SStefano Zampini #endif
2594fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2595fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2596fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2597fcdce8c4SStefano Zampini finalizesym:
2598fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2599fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2600fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2601fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2602fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2603fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2604fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2605fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2606fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2607fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2608fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2609fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2610fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2611fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2612fcdce8c4SStefano Zampini   } else {
2613fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2614fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2615fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617fcdce8c4SStefano Zampini   }
2618fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2619fcdce8c4SStefano Zampini     PetscInt r = 0;
2620fcdce8c4SStefano Zampini     c->i[0] = 0;
2621fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2622fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2623fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2624fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2625fcdce8c4SStefano Zampini     }
2626fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2627fcdce8c4SStefano Zampini   }
2628fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2629fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2630fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2631fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2632fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2633fcdce8c4SStefano Zampini   c->rmax = 0;
2634fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2635fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2636fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2637fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2638fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2639fcdce8c4SStefano Zampini   }
2640fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2641fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2642fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2643fcdce8c4SStefano Zampini 
2644fcdce8c4SStefano Zampini   C->nonzerostate++;
2645fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2646fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2647fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2648fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2649fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2650fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2651fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2652abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2653fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2654fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2655fcdce8c4SStefano Zampini   }
2656fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2657fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2658fcdce8c4SStefano Zampini }
2659fcdce8c4SStefano Zampini 
2660fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2661fcdce8c4SStefano Zampini 
/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic routine (mat->ops->productsymbolic) for the
  product described by mat->product; handles sparse or dense B.

  Input Parameter:
. mat - the product matrix; mat->product must be set (checked with MatCheckProduct())

  Options Database Keys (queried only when B, and C for ABC products, are SEQAIJCUSPARSE and no operand tested is bound to the CPU):
+ -matmatmult_backend_cpu / -matproduct_ab_backend_cpu            - use the CPU code for AB
. -mattransposematmult_backend_cpu / -matproduct_atb_backend_cpu  - use the CPU code for AtB
. -matptap_backend_cpu / -matproduct_ptap_backend_cpu             - use the CPU code for PtAP
. -matrart_backend_cpu / -matproduct_rart_backend_cpu             - use the CPU code for RARt
- -matmatmatmult_backend_cpu / -matproduct_abc_backend_cpu        - use the CPU code for ABC

  Notes:
  The first name of each pair is used when product->api_user is set, the second one otherwise.
  When none of the GPU paths applies, the decision is delegated to MatProductSetFromOptions_SeqAIJ().
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);

  /* classify the operands */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) { /* the third operand matters only for triple products */
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    const PetscBool api    = product->api_user;
    const char      *title = NULL,*option = NULL,*manual = NULL;
    PetscBool       usecpu = PETSC_FALSE;

    /* choose the strings for the single options query below; product types not listed here have no such option */
    switch (product->type) {
    case MATPRODUCT_AB:
      manual = "MatMatMult";
      title  = api ? "MatMatMult" : "MatProduct_AB";
      option = api ? "-matmatmult_backend_cpu" : "-matproduct_ab_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      manual = "MatTransposeMatMult";
      title  = api ? "MatTransposeMatMult" : "MatProduct_AtB";
      option = api ? "-mattransposematmult_backend_cpu" : "-matproduct_atb_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      manual = "MatPtAP";
      title  = api ? "MatPtAP" : "MatProduct_PtAP";
      option = api ? "-matptap_backend_cpu" : "-matproduct_ptap_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      manual = "MatRARt";
      title  = api ? "MatRARt" : "MatProduct_RARt";
      option = api ? "-matrart_backend_cpu" : "-matproduct_rart_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      manual = "MatMatMatMult";
      title  = api ? "MatMatMatMult" : "MatProduct_ABC";
      option = api ? "-matmatmatmult_backend_cpu" : "-matproduct_abc_backend_cpu";
      break;
    default:
      break;
    }
    if (option) {
      /* PetscOptionsBegin()/PetscOptionsEnd() must stay paired within the same scope */
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");CHKERRQ(ierr);
      ierr = PetscOptionsBool(option,"Use CPU code",manual,usecpu,&usecpu,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* forces the AIJ fallback at the end of this routine */
  }

  /* dispatch: dense B */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    PetscFunctionReturn(0);
  }

  /* dispatch: fallback for AIJ */
  if (!Biscusp || !Ciscusp) {
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* dispatch: sparse operands on the GPU */
  switch (product->type) {
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
  case MATPRODUCT_ABC:
    mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    break;
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
    mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
2784ccdfe979SStefano Zampini 
/* MatMult_SeqAIJCUSPARSE - forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL)
   and with both the transpose and the Hermitian flags switched off; the result goes into yy */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2793e6e9a74fSStefano Zampini 
/* MatMultAdd_SeqAIJCUSPARSE - forwards to MatMultAddKernel_SeqAIJCUSPARSE() passing yy as the vector to add
   and zz as the output, with both the transpose and the Hermitian flags switched off */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2802e6e9a74fSStefano Zampini 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector
   to add (NULL) and with both the transpose and the Hermitian flags switched on; the result goes into yy */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2811e6e9a74fSStefano Zampini 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - forwards to MatMultAddKernel_SeqAIJCUSPARSE() passing yy as
   the vector to add and zz as the output, with both the transpose and the Hermitian flags switched on */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
28209ae82921SPaul Mullowney 
/* MatMultTranspose_SeqAIJCUSPARSE - forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL),
   the transpose flag switched on and the Hermitian flag switched off; the result goes into yy */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2829ca45077fSPaul Mullowney 
/*
  ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for every 0 <= i < n

  Input Parameters:
+ n   - number of entries of x (and of idx) to process
. idx - for each i, the position in y that receives x[i]
- x   - values to add

  Input/Output Parameter:
. y   - array updated in place

  Notes:
  Each thread handles the single entry given by its global 1D index; threads whose index is >= n do nothing,
  so the launch must provide at least n threads along x.

  The update is a plain (non-atomic) read-modify-write: two threads running with the same idx[i] would race.

  The global index is formed in PetscInt arithmetic. Forming it in 32-bit arithmetic and storing it in an int
  could wrap or become negative for very large launches when PetscInt is 64-bit, and a wrapped/negative index
  would still pass the i < n test.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2835a0e72f99SJunchao Zhang 
2836afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2837e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
28389ae82921SPaul Mullowney {
28399ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2840aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
28419ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2842e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2843b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
284457d48284SJunchao Zhang   cudaError_t                  cerr;
2845aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2846e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2847e6e9a74fSStefano Zampini   PetscBool                    compressed;
2848afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2849afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2850afb2bd1cSJunchao Zhang #endif
28516e111a19SKarl Rupp 
28529ae82921SPaul Mullowney   PetscFunctionBegin;
2853e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2854e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2855afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2856d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2857e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2858e6e9a74fSStefano Zampini   }
285934d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
286034d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2861e6e9a74fSStefano Zampini   if (!trans) {
28629ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2863c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2864e6e9a74fSStefano Zampini   } else {
28651a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
2866e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2867e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2868e6e9a74fSStefano Zampini     } else {
28691a2c6b5cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2870e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2871e6e9a74fSStefano Zampini     }
2872e6e9a74fSStefano Zampini   }
2873e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2874e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2875213423ffSJunchao Zhang 
2876e6e9a74fSStefano Zampini   try {
2877e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2878213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2879213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2880afb2bd1cSJunchao Zhang 
288185ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2882e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2883afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2884afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2885afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2886afb2bd1cSJunchao Zhang       */
2887e6e9a74fSStefano Zampini       xptr = xarray;
2888afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2889213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2890afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2891afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2892afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2893afb2bd1cSJunchao Zhang        */
2894afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2895afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2896afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2897afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2898afb2bd1cSJunchao Zhang       }
2899afb2bd1cSJunchao Zhang      #endif
2900e6e9a74fSStefano Zampini     } else {
2901afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2902afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2903afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2904afb2bd1cSJunchao Zhang        */
2905afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2906e6e9a74fSStefano Zampini       dptr = zarray;
2907e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2908afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2909e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2910a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2911e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2912e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2913e6e9a74fSStefano Zampini       }
2914afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2915afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2916afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2917afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2918afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2919afb2bd1cSJunchao Zhang       }
2920afb2bd1cSJunchao Zhang      #endif
2921e6e9a74fSStefano Zampini     }
29229ae82921SPaul Mullowney 
2923afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2924aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2925afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2926afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2927afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2928afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2929afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2930afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2931afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2932afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2933afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2934afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2935afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2936afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2937afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2938afb2bd1cSJunchao Zhang 
2939afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2940afb2bd1cSJunchao Zhang       } else {
2941afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2942afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2943afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2944afb2bd1cSJunchao Zhang       }
2945afb2bd1cSJunchao Zhang 
2946afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2947afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
29481a2c6b5cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2949afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2950afb2bd1cSJunchao Zhang                                beta,
2951afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2952afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2953afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2954afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2955afb2bd1cSJunchao Zhang      #else
29567656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2957e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2958a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2959afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2960aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2961e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
296257d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2963afb2bd1cSJunchao Zhang      #endif
2964aa372e3fSPaul Mullowney     } else {
2965213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2966afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2967afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2968afb2bd1cSJunchao Zhang        #else
2969301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2970e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2971afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2972e6e9a74fSStefano Zampini                                  xptr, beta,
297357d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2974afb2bd1cSJunchao Zhang        #endif
2975a65300a6SPaul Mullowney       }
2976aa372e3fSPaul Mullowney     }
297705035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2978958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2979aa372e3fSPaul Mullowney 
2980e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2981213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2982213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2983213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2984e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2985213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
29867656d835SStefano Zampini         }
2987213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2988c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
29897656d835SStefano Zampini       }
29907656d835SStefano Zampini 
2991213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2992213423ffSJunchao Zhang       if (compressed) {
2993e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2994a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2995a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2996a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
2997a0e72f99SJunchao Zhang          */
2998a0e72f99SJunchao Zhang        #if 0
2999a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3000a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3001a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3002e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3003c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3004a0e72f99SJunchao Zhang        #else
3005a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3006a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3007a0e72f99SJunchao Zhang        #endif
300805035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
3009958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3010e6e9a74fSStefano Zampini       }
3011e6e9a74fSStefano Zampini     } else {
3012e6e9a74fSStefano Zampini       if (yy && yy != zz) {
3013e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
3014e6e9a74fSStefano Zampini       }
3015e6e9a74fSStefano Zampini     }
3016e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
3017213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
3018213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
30199ae82921SPaul Mullowney   } catch(char *ex) {
30209ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
30219ae82921SPaul Mullowney   }
3022e6e9a74fSStefano Zampini   if (yy) {
3023958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
3024e6e9a74fSStefano Zampini   } else {
3025e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
3026e6e9a74fSStefano Zampini   }
30279ae82921SPaul Mullowney   PetscFunctionReturn(0);
30289ae82921SPaul Mullowney }
30299ae82921SPaul Mullowney 
/*
  MatMultTransposeAdd_SeqAIJCUSPARSE - Thin wrapper that forwards (A,xx,yy,zz) to
  MatMultAddKernel_SeqAIJCUSPARSE() with the two trailing flags set to PETSC_TRUE and PETSC_FALSE.

  NOTE(review): the two flags are presumably (trans,herm), which would make this
  zz = A^T xx + yy without conjugation -- confirm against the kernel's signature.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
  CHKERRQ(err);
  PetscFunctionReturn(0);
}
3038ca45077fSPaul Mullowney 
/*
  MatAssemblyEnd_SeqAIJCUSPARSE - Finishes assembly through MatAssemblyEnd_SeqAIJ() and, for a
  final assembly of a matrix that is not bound to the CPU and that owns a device-side
  PetscSplitCSRDataStructure, marks the matrix data as living on the GPU.

  Input Parameters:
+  A    - the matrix
-  mode - the assembly type; MAT_FLUSH_ASSEMBLY leaves the offload mask untouched
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat;

  PetscFunctionBegin;
  /* spptr is only interpreted as Mat_SeqAIJCUSPARSE for unfactored matrices; the handle is read before the SeqAIJ assembly runs */
  deviceMat = (A->factortype == MAT_FACTOR_NONE) ? ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat : NULL;

  /* this does very little if assembled on GPU - call it? */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);

  if (deviceMat && mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
30559ae82921SPaul Mullowney 
30569ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create the object, then give it its sizes; local and global sizes are both (m,n) */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);

  /* select the CUSPARSE implementation before preallocating */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* the preallocation routine is handed a non-const pointer, hence the cast */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
31159ae82921SPaul Mullowney 
/*
  MatDestroy_SeqAIJCUSPARSE - Releases the GPU-side data attached to A, removes the functions
  composed on the object, and finally destroys the underlying SeqAIJ matrix.

  For an unfactored matrix the device-side PetscSplitCSRDataStructure pointer is detached from
  the Mat_SeqAIJCUSPARSE struct before that struct is destroyed, and is freed separately afterwards.
  For a factored matrix spptr is destroyed as a Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in the order they are cleared */
  const char *const          composedNames[] = {"MatSeqAIJCopySubArray_C",
                                                "MatCUSPARSESetFormat_C",
                                                "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                                "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                                "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                                "MatFactorGetSolverType_C",
                                                "MatSetPreallocationCOO_C",
                                                "MatSetValuesCOO_C"};
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat = NULL;
  size_t                     k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* take the device matrix out of the struct before the struct is destroyed */
    deviceMat       = cusp->deviceMat;
    cusp->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }

  if (deviceMat) {
    Mat_SeqAIJ                 *aij = (Mat_SeqAIJ*)A->data;
    cudaError_t                cerr;
    PetscSplitCSRDataStructure hostCopy;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* fetch the struct to the host so that the pointers stored inside it can be read */
    cerr = cudaMemcpy(&hostCopy,deviceMat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    /* diag.i is released only when compressed rows are in use */
    if (aij->compressedrow.use) {
      cerr = cudaFree(hostCopy.diag.i);CHKERRCUDA(cerr);
    }
    cerr = cudaFree(deviceMat);CHKERRCUDA(cerr);
  }

  for (k=0; k<sizeof(composedNames)/sizeof(composedNames[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composedNames[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
31519ae82921SPaul Mullowney 
3152ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
315395639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A through the SeqAIJ implementation and then converts
  the duplicate in place to MATSEQAIJCUSPARSE.

  Input Parameters:
+  A        - the matrix to duplicate
-  cpvalues - duplication option, passed through unchanged to MatDuplicate_SeqAIJ()

  Output Parameter:
.  B - the duplicate
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);
  CHKERRQ(err);
  /* in-place conversion: *B is both the input and the output of the convert call */
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
  CHKERRQ(err);
  PetscFunctionReturn(0);
}
31639ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

  Input Parameters:
+  Y   - the matrix that is updated
.  a   - the scalar multiplier
.  X   - the matrix that is added
-  str - relation between the nonzero patterns of X and Y

  Dispatch:
   - If X and Y do not share the same axpy implementation, fall back to MatAXPY_SeqAIJ().
   - If the patterns are (or are detected to be) identical, a single cublasXaxpy() over the values is used.
   - If the pattern of X is a subset of that of Y, cusparse_csr_spgeam() writes the sum into Y's existing arrays.
   - Otherwise fall back to MatAXPY_SeqAIJ().
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *xaij = (Mat_SeqAIJ*)X->data,*yaij = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cuspY = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cuspX = (Mat_SeqAIJCUSPARSE*)X->spptr;
  CsrMatrix          *csrY,*csrX;
  const PetscScalar  *xvals;
  PetscScalar        *yvals;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cuspY->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cuspX->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csrY = (CsrMatrix*)cuspY->mat->mat;
  csrX = (CsrMatrix*)cuspX->mat->mat;

  /* see if we can turn this into a cublas axpy: same nonzero count, no compressed rows, identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool samePattern = thrust::equal(thrust::device,csrY->row_offsets->begin(),csrY->row_offsets->end(),csrX->row_offsets->begin())
                             && thrust::equal(thrust::device,csrY->column_indices->begin(),csrY->column_indices->end(),csrX->column_indices->begin());
    if (samePattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry by entry, so a dense axpy over them suffices */
    cublasHandle_t blasHandle;
    cublasStatus_t bstat;
    PetscBLASInt   inc = 1,nvals = 1;

    ierr  = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
    ierr  = PetscCUBLASGetHandle(&blasHandle);CHKERRQ(ierr);
    ierr  = PetscBLASIntCast(xaij->nz,&nvals);CHKERRQ(ierr);
    ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    bstat = cublasXaxpy(blasHandle,nvals,&a,xvals,inc,yvals,inc);CHKERRCUBLAS(bstat);
    cerr  = WaitForCUDA();CHKERRCUDA(cerr);
    ierr  = PetscLogGpuFlops(2.0*nvals);CHKERRQ(ierr);
    ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
    ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SUBSET_NONZERO_PATTERN) {
    /* subset pattern: spgeam computes a*X + unit*Y and stores the result in Y's own CSR arrays */
    cusparseStatus_t stat;
    PetscScalar      unit = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           workSize;
    void             *work;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
    /* the scalars a and unit live on the host for the duration of the spgeam call */
    stat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* since CUDA 11 the caller has to provide the work buffer */
    stat = cusparse_csr_spgeam_bufferSize(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                          &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                             cuspY->mat->descr,      yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),&workSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&work,workSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                               &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                               &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                  cuspY->mat->descr,      yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),work);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                               &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                               &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                  cuspY->mat->descr,      yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(work);CHKERRCUDA(cerr);
#endif
    /* put the handle back into device pointer mode */
    stat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* any other pattern relation is handled by the SeqAIJ implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
326495639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Computes Y = a*Y by scaling the nz stored values of Y with cublasXscal().

  Input Parameters:
+  Y - the matrix to scale
-  a - the scalar factor
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *yaij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yvals;
  PetscBLASInt   inc = 1,nvals = 1;
  cublasHandle_t blasHandle;
  cublasStatus_t bstat;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr  = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
  ierr  = PetscCUBLASGetHandle(&blasHandle);CHKERRQ(ierr);
  /* the BLAS call takes a PetscBLASInt length, so nz goes through a checked cast */
  ierr  = PetscBLASIntCast(yaij->nz,&nvals);CHKERRQ(ierr);

  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  bstat = cublasXscal(blasHandle,nvals,&a,yvals,inc);CHKERRCUBLAS(bstat);
  cerr  = WaitForCUDA();CHKERRCUDA(cerr);
  ierr  = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  /* the values changed, so the diagonal information kept by the SeqAIJ layer is invalidated */
  ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
328833c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeros the values of A on the host and, where device
   value arrays exist, on the device as well.

   For unfactored matrices the device values of the matrix and of its cached transpose
   (if present) are filled with zeros. The host array a->a is always zeroed over
   a->i[A->rmap->n] entries. The offload mask becomes PETSC_OFFLOAD_BOTH when the
   device values of the matrix itself were zeroed, PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      devzeroed = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        devzeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      /* zeroing the transpose values does not affect the offload mask */
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* host side: zero the values directly instead of calling MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = devzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
33203fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Switches the operation table and composed functions of A
   between the host (SeqAIJ) and the CUSPARSE implementations.

   Input Parameters:
+  A   - the matrix
-  flg - PETSC_TRUE to install the SeqAIJ implementations (after copying the values back
         from the GPU), PETSC_FALSE to install the CUSPARSE implementations

   Notes:
   Factored matrices (A->factortype != MAT_FACTOR_NONE) are left untouched.
   On exit A->boundtocpu and the inode usage flag are both set to flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);

  if (!flg) {
    /* GPU path: install the CUSPARSE kernels */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;

    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* CPU path: bring the values back to the host before swapping in the SeqAIJ kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;

    /* remove the GPU-specific composed functions; only MatSeqAIJGetArray_C keeps a (host) implementation */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }

  aij->inode.use = flg;
  A->boundtocpu  = flg;
  PetscFunctionReturn(0);
}
3371a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Converts a SeqAIJ matrix to the SeqAIJCUSPARSE type.

   Input Parameters:
+  A     - the matrix to convert
.  mtype - the requested type (not inspected here)
-  reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies the values
           of A into the existing *newmat, any other value operates on *newmat as passed in

   Output Parameter:
.  newmat - the converted matrix

   Notes:
   Unless reuse is MAT_REUSE_MATRIX, and only if the matrix has no GPU data structure yet,
   a Mat_SeqAIJCUSPARSE (unfactored) or Mat_SeqAIJCUSPARSETriFactors (factored) struct is
   allocated together with a cuSPARSE handle bound to PetscDefaultCudaStream.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to create or copy */
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (!B->spptr && reuse != MAT_REUSE_MATRIX) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *tri;

      ierr = PetscNew(&tri);CHKERRQ(ierr);
      stat = cusparseCreate(&tri->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(tri->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = tri;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* type-specific operations that do not depend on the CPU/GPU binding */
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;

  /* installs the remaining (GPU) operations and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34259ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor: builds B as a SeqAIJ matrix and then
   converts it in place to MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* start from a plain host SeqAIJ matrix ... */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* ... and retarget it to the CUSPARSE implementation without creating a new object */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
343502fe1965SBarry Smith 
34363ca39a21SBarry Smith /*MC
3437e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3438e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported with CUDA 11.0 or later.
34412692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3442e057df02SPaul Mullowney 
3443e057df02SPaul Mullowney    Options Database Keys:
3444e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3445aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3446a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3447e057df02SPaul Mullowney 
3448e057df02SPaul Mullowney   Level: beginner
3449e057df02SPaul Mullowney 
34508468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3451e057df02SPaul Mullowney M*/
34527f756511SDominic Meiser 
3453bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
34540f39cd5aSBarry Smith 
/*
   MatSolverTypeRegister_CUSPARSE - Registers the CUSPARSE solver packages.

   MATSOLVERCUSPARSEBAND is registered for MATSEQAIJ with LU; MATSOLVERCUSPARSE is
   registered for MATSEQAIJCUSPARSE with LU, Cholesky, ILU and ICC.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const size_t        nftypes  = sizeof(ftypes)/sizeof(ftypes[0]);
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* the same factory serves every factorization type of the regular CUSPARSE solver */
  for (size_t k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
346829b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Releases everything owned by a Mat_SeqAIJCUSPARSE struct:
   the mult structs of the matrix and its transpose, the auxiliary device arrays, the
   cuSPARSE handle (if any) and finally the struct itself. A NULL *cusparsestruct is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* deleting a NULL pointer is a no-op, so no guards are needed here */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34887f756511SDominic Meiser 
/*
   CsrMatrix_Destroy - Deletes the values, column-index and row-offset arrays of a
   CsrMatrix, then the CsrMatrix itself, and resets *mat to NULL. A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
35017f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - Releases the cuSPARSE
   matrix descriptor, the solve analysis info, the CSR storage, the device work buffers and
   the pinned host buffer AA_h of one triangular factor, then frees the struct itself.
   A NULL *trifactor is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    /* AA_h is released with cudaFreeHost(), i.e. it is treated as CUDA-allocated host memory */
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
35217f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - Releases the storage held by
   a Mat_SeqAIJCUSPARSEMultStruct and deletes the struct.

   Input Parameters:
+  matstruct - address of the struct pointer; set to NULL on return (a NULL *matstruct is a no-op)
-  format    - storage format of (*matstruct)->mat; selects how that member is destroyed

   Notes:
   For MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB the matrix is destroyed as a cusparseHybMat_t;
   with CUDA 11.0 or later these formats raise PETSC_ERR_SUP instead. Any other format is
   treated as a CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* propagate a failure instead of silently dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* release the buffer and dense-vector descriptors of every initialized SpMV slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
35657f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSETriFactors_Reset - Releases the factor data held by a
   Mat_SeqAIJCUSPARSETriFactors struct while keeping the struct itself (and its cuSPARSE
   handle) alive.

   The four triangular-factor structs, the permutation index arrays, the work vector and
   the band-solver device arrays are freed. Every released member is reset to NULL so the
   routine can safely run more than once on the same struct; it is invoked again from
   MatSeqAIJCUSPARSETriFactors_Destroy(). A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d) {cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    /* clear the stale device pointers: a later Reset/Destroy must not cudaFree() them again */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
  }
  PetscFunctionReturn(0);
}
3587ccdfe979SStefano Zampini 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - Fully destroys a Mat_SeqAIJCUSPARSETriFactors
   struct: resets its factor data, destroys its cuSPARSE handle (if any) and frees the
   struct. A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36047e8381f9SStefano Zampini 
/* Strict lexicographic "less than" on (first, second) index pairs */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = t1.get<0>(), i2 = t2.get<0>();

    /* the first component decides unless it ties; then the second component breaks the tie */
    if (i1 != i2) return i1 < i2;
    return t1.get<1>() < t2.get<1>();
  }
};
36157e8381f9SStefano Zampini 
/* Equality of two index pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
36257e8381f9SStefano Zampini 
/* Binary functor returning 1 when its two arguments differ and 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
36347e8381f9SStefano Zampini 
/* Binary functor returning the logical OR of its arguments as 0/1 (despite the name, not an arithmetic sum) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
36437e8381f9SStefano Zampini 
36447e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - Sets the values of A on the GPU from an array given in
   the COO ordering recorded in cusp->cooPerm.

   Input Parameters:
+  A     - the matrix
.  v     - the values, in host or CUDA memory (checked with isCudaMem()); host values are
           first copied into a temporary device array. May be NULL, in which case the matrix
           values are zeroed for INSERT_VALUES and left alone otherwise
-  imode - ADD_VALUES adds to the existing values, any other mode overwrites them

   Notes:
   When cusp->cooPerm_a is present (the code comments describe this as repeated entries in
   the COO input), values sharing the same key in cooPerm_a are summed before being stored
   or added. If no COO permutation is available the matrix is simply assembled.
   On return the values are flagged as valid on the GPU and A is marked assembled.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *v_dev = NULL; /* device copy of v, only allocated when v lives on the host */
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (!v) {
    /* no input values: inserting means zeroing, adding is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  } else {
    thrust::device_ptr<const PetscScalar> d_v;
    cudaError_t                           cerr;
    PetscInt                              n;

    n = cusp->cooPerm->size();
    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      v_dev = new THRUSTARRAY(n);
      v_dev->assign(v,v+n);
      d_v  = v_dev->data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (cusp->cooPerm_a) { /* repeated entries in COO -> reduce them by key first */
      auto pv = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());

      if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
        THRUSTARRAY *sums = new THRUSTARRAY(csr->values->size());

        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),pv,thrust::make_discard_iterator(),sums->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(sums->begin(),sums->end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
        delete sums;
      } else { /* overwrite the matrix values with the reduced sums */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),pv,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else { /* pair each permuted input value with its matrix value */
      auto zfirst = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                 csr->values->begin()));
      auto zlast  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                 csr->values->end()));

      if (imode == ADD_VALUES) {
        thrust::for_each(zfirst,zlast,VecCUDAPlusEquals());
      } else {
        thrust::for_each(zfirst,zlast,VecCUDAEquals());
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }

  delete v_dev;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,aij->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",aij->rmax);CHKERRQ(ierr);
  aij->reallocs       = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
37237e8381f9SStefano Zampini 
/*
   MatSeqAIJCUSPARSEInvalidateTranspose - flags the GPU transpose of A as out of date.

   Input Parameters:
+  A       - the matrix, must be of type MATSEQAIJCUSPARSE
-  destroy - if PETSC_TRUE, also destroy the matTranspose struct and delete the csr2csc_i array

   Notes:
   Does nothing (and leaves A->transupdated untouched) when A->spptr is NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (gpu) {
    if (destroy) {
      /* drop the transpose storage together with its csr2csc_i companion array */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose,gpu->format);CHKERRQ(ierr);
      delete gpu->csr2csc_i;
      gpu->csr2csc_i = NULL;
    }
    /* whatever transpose data is left no longer matches A */
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3740a49f1ed0SStefano Zampini 
37417e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero structure of A from n (row,column)
   coordinate pairs and caches, on the GPU, the permutation needed to insert values later.

   Input Parameters:
+  A     - the matrix
.  n     - number of coordinate entries
.  coo_i - row indices, read as n host-side PetscInt values
-  coo_j - column indices, read as n host-side PetscInt values

   Notes:
   cusp->cooPerm receives the permutation that sorts the entries by (i,j) with IJCompare.
   cusp->cooPerm_a is kept only when repeated (i,j) pairs are found, otherwise it is deleted and
   set to NULL. The host arrays a->i, a->j, a->a, a->ilen and a->imax are rebuilt, the values are
   zeroed and copied to the GPU, and the matrix is left flagged as unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  PetscInt           oldLen = 0, nonemptyRows = 0;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);

  /* cached permutation arrays are reusable only if they were built for the same entry count */
  if (gpu->cooPerm) oldLen = gpu->cooPerm->size();
  if (n != oldLen) {
    delete gpu->cooPerm;
    delete gpu->cooPerm_a;
    gpu->cooPerm   = NULL;
    gpu->cooPerm_a = NULL;
  }

  if (!n) {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  } else {
    const PetscInt nrows = A->rmap->n;
    THRUSTINTARRAY rowsGpu(n);
    THRUSTINTARRAY colsGpu(n);
    THRUSTINTARRAY rowEnd(nrows);

    if (!gpu->cooPerm)   { gpu->cooPerm   = new THRUSTINTARRAY(n); }
    if (!gpu->cooPerm_a) { gpu->cooPerm_a = new THRUSTINTARRAY(n); }

    /* upload the coordinate lists */
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    rowsGpu.assign(coo_i,coo_i+n);
    colsGpu.assign(coo_j,coo_j+n);
    auto keyBegin = thrust::make_zip_iterator(thrust::make_tuple(rowsGpu.begin(),colsGpu.begin()));
    auto keyEnd   = thrust::make_zip_iterator(thrust::make_tuple(rowsGpu.end(),colsGpu.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) keys, dragging the identity permutation 0..n-1 along with them */
    thrust::sequence(thrust::device, gpu->cooPerm->begin(), gpu->cooPerm->end(), 0);
    thrust::sort_by_key(keyBegin, keyEnd, gpu->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted indices: the unique() below compacts rowsGpu/colsGpu in place */
    *gpu->cooPerm_a = rowsGpu;
    THRUSTINTARRAY colJump = colsGpu;

    auto uniqueEnd = thrust::unique(keyBegin, keyEnd, IJEqual());
    if (uniqueEnd == keyEnd) {
      /* no repeated (i,j) pair: the auxiliary array is not needed */
      delete gpu->cooPerm_a;
      gpu->cooPerm_a = NULL;
    } else {
      /* Combine the adjacent differences of the sorted rows and columns (IJDiff, IJSum) and scan them.
         NOTE(review): presumably this yields, for each sorted entry, the index of its distinct (i,j)
         group -- IJDiff/IJSum are defined elsewhere, confirm there. */
      thrust::adjacent_difference(gpu->cooPerm_a->begin(),gpu->cooPerm_a->end(),gpu->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(colJump.begin(),colJump.end(),colJump.begin(),IJDiff());
      (*gpu->cooPerm_a)[0] = 0;
      colJump[0] = 0;
      thrust::transform(gpu->cooPerm_a->begin(),gpu->cooPerm_a->end(),colJump.begin(),gpu->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(gpu->cooPerm_a->begin(),gpu->cooPerm_a->end(),gpu->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* for every row r in [0,nrows): position one past the last unique entry whose row index is <= r */
    thrust::counting_iterator<PetscInt> rowIds(0);
    thrust::upper_bound(rowsGpu.begin(), uniqueEnd.get_iterator_tuple().get<0>(),
                        rowIds, rowIds + nrows,
                        rowEnd.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&aij->a,&aij->j,&aij->i);CHKERRQ(ierr);
    aij->singlemalloc = PETSC_FALSE;
    aij->free_a       = PETSC_TRUE;
    aij->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(nrows+1,&aij->i);CHKERRQ(ierr);
    aij->i[0] = 0;
    cerr = cudaMemcpy(aij->i+1,rowEnd.data().get(),nrows*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    aij->maxnz = aij->i[nrows];
    aij->nz    = aij->maxnz;
    aij->rmax  = 0;
    ierr = PetscMalloc1(aij->nz,&aij->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(aij->nz,&aij->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->j,colsGpu.data().get(),aij->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!aij->ilen) { ierr = PetscMalloc1(nrows,&aij->ilen);CHKERRQ(ierr); }
    if (!aij->imax) { ierr = PetscMalloc1(nrows,&aij->imax);CHKERRQ(ierr); }
    /* per-row lengths, count of nonempty rows and longest row */
    for (PetscInt r = 0; r < nrows; r++) {
      const PetscInt len = aij->i[r+1] - aij->i[r];
      if (len) nonemptyRows++;
      aij->imax[r] = len;
      aij->ilen[r] = len;
      if (aij->rmax < len) aij->rmax = len;
    }
    aij->nonzerorowcnt = nonemptyRows;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((nrows+aij->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* Zero the host values, mark the host copy as the valid one and push it to the GPU,
     so that the CUSPARSE structures used for matvec get allocated now */
  ierr = PetscArrayzero(aij->a,aij->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nonemptyRows,&aij->compressedrow,aij->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* only the pattern has been set: values are still to come */
  A->was_assembled = PETSC_FALSE;
  A->assembled     = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3841ed502f03SStefano Zampini 
/*
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the values array of the CSR matrix
   stored in the CUSPARSE struct of A.

   Input Parameter:
.  A - the matrix, must be of type MATSEQAIJCUSPARSE

   Output Parameter:
.  a - set to the raw pointer of the CSR values array

   Notes:
   MatSeqAIJCUSPARSECopyToGPU() is called first. The ELL and HYB storage formats raise PETSC_ERR_SUP.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *storage;
  PetscErrorCode     ierr;
  PetscBool          unsupported;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR layout exposes a flat values array */
  unsupported = (PetscBool)(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB);
  if (unsupported) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!gpu->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  storage = (CsrMatrix*)gpu->mat->mat;
  if (!storage->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = storage->values->data().get();
  PetscFunctionReturn(0);
}
3860ed502f03SStefano Zampini 
/*
   MatSeqAIJCUSPARSERestoreArrayRead - ends the read-only access started with
   MatSeqAIJCUSPARSEGetArrayRead().

   Input Parameters:
+  A - the matrix, must be of type MATSEQAIJCUSPARSE
-  a - address of the pointer obtained from the Get call; reset to NULL on return

   Notes:
   The matrix itself is not modified.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate the caller's handle */
  *a = NULL;
  PetscFunctionReturn(0);
}
3870ed502f03SStefano Zampini 
/*
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the values array of the CSR matrix
   stored in the CUSPARSE struct of A.

   Input Parameter:
.  A - the matrix, must be of type MATSEQAIJCUSPARSE

   Output Parameter:
.  a - set to the raw pointer of the CSR values array

   Notes:
   MatSeqAIJCUSPARSECopyToGPU() is called first, then the offload mask is set to PETSC_OFFLOAD_GPU
   and the transpose is flagged as out of date (without destroying it).
   The ELL and HYB storage formats raise PETSC_ERR_SUP.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *storage;
  PetscErrorCode     ierr;
  PetscBool          unsupported;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR layout exposes a flat values array */
  unsupported = (PetscBool)(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB);
  if (unsupported) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!gpu->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  storage = (CsrMatrix*)gpu->mat->mat;
  if (!storage->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = storage->values->data().get();
  /* the caller may write through the pointer: the GPU copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3891039c6fbaSStefano Zampini 
/*
   MatSeqAIJCUSPARSERestoreArray - ends the read-write access started with
   MatSeqAIJCUSPARSEGetArray().

   Input Parameters:
+  A - the matrix, must be of type MATSEQAIJCUSPARSE
-  a - address of the pointer obtained from the Get call; reset to NULL on return

   Notes:
   The object state of A is increased.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* record that A has been changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* invalidate the caller's handle */
  *a = NULL;
  PetscFunctionReturn(0);
}
3904039c6fbaSStefano Zampini 
/*
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the values array of the CSR matrix
   stored in the CUSPARSE struct of A.

   Input Parameter:
.  A - the matrix, must be of type MATSEQAIJCUSPARSE

   Output Parameter:
.  a - set to the raw pointer of the CSR values array

   Notes:
   Unlike MatSeqAIJCUSPARSEGetArray(), no copy to the GPU is performed here, so the CUSPARSE struct
   must already exist (PETSC_ERR_COR otherwise). The offload mask is set to PETSC_OFFLOAD_GPU
   and the transpose is flagged as out of date (without destroying it).
   The ELL and HYB storage formats raise PETSC_ERR_SUP.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *storage;
  PetscErrorCode     ierr;
  PetscBool          unsupported;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR layout exposes a flat values array */
  unsupported = (PetscBool)(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB);
  if (unsupported) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!gpu->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  storage = (CsrMatrix*)gpu->mat->mat;
  if (!storage->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = storage->values->data().get();
  /* the caller will write through the pointer: the GPU copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3924ed502f03SStefano Zampini 
/*
   MatSeqAIJCUSPARSERestoreArrayWrite - ends the write access started with
   MatSeqAIJCUSPARSEGetArrayWrite().

   Input Parameters:
+  A - the matrix, must be of type MATSEQAIJCUSPARSE
-  a - address of the pointer obtained from the Get call; reset to NULL on return

   Notes:
   The object state of A is increased.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* record that A has been changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* invalidate the caller's handle */
  *a = NULL;
  PetscFunctionReturn(0);
}
3937ed502f03SStefano Zampini 
/*
   Strict-weak-ordering comparator on 4-component tuples: orders lexicographically by
   component 0, then by component 1. Components 2 and 3 are ignored.
   It is used as the comparator of thrust::merge in MatSeqAIJCUSPARSEMergeMats(), where the
   tuples are (row, column, value, flag).

   The call operator is const-qualified so the comparator can also be invoked through a
   const object or reference; it holds no state.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    /* different first components decide the order on their own */
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    /* tie on the first component: fall back to the second */
    return t1.get<1>() < t2.get<1>();
  }
};
3948ed502f03SStefano Zampini 
/*
   Unary functor that adds a fixed integer offset to its argument.
   It is used with thrust::make_transform_iterator and thrust::transform to shift
   column indices and row offsets.

   The call operator is const-qualified (it does not modify _shift), so the functor can be
   invoked through a const object, e.g. from within a const iterator adaptor.
*/
struct Shift
{
  int _shift; /* offset added to every input */

  /* constructed on the host; kept implicit so existing callers passing integer expressions still work */
  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3960ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3962ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3963ed502f03SStefano Zampini {
3964ed502f03SStefano Zampini   PetscErrorCode               ierr;
3965ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3966ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3967ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3968ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3969ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
3970ed502f03SStefano Zampini   cusparseStatus_t             stat;
3971ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
3972ed502f03SStefano Zampini   cudaError_t                  cerr;
3973ed502f03SStefano Zampini 
3974ed502f03SStefano Zampini   PetscFunctionBegin;
3975ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3976ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3977ed502f03SStefano Zampini   PetscValidPointer(C,4);
3978ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3979ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3980ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3981ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3982ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3983ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3984ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
3985ed502f03SStefano Zampini     m     = A->rmap->n;
3986ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
3987ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3988ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3989ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3990ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
3991ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3992ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3993ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
3994ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
3995ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
3996ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
3997ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
3998ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
3999ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4000ed502f03SStefano Zampini     Ccusp->nrows    = m;
4001ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4002ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4003ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4004ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4005ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4006ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4007ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4008ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4009ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4010ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4011ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4012ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4013ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4014ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4015ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
40161a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
40171a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
4018ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4019ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4020ed502f03SStefano Zampini 
4021ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4022ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4023ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4024ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4025ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4026ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4027ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4028ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4029ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4030ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4031ed502f03SStefano Zampini     if (c->nz) {
40322ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
40332ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
40342ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
40352ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
40362ed87e7eSStefano Zampini 
4037ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4038ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4039ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4040ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4041ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4042ed502f03SStefano Zampini         }
40432ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
40442ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4045ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4046ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4047ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4048ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4049ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4050ed502f03SStefano Zampini         }
40512ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
40522ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4053ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
40542ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
40552ed87e7eSStefano Zampini                               Aroff->data().get(),
40562ed87e7eSStefano Zampini                               Annz,
40572ed87e7eSStefano Zampini                               m,
40582ed87e7eSStefano Zampini                               Acoo->data().get(),
40592ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4060ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
40612ed87e7eSStefano Zampini                               Broff->data().get(),
4062ed502f03SStefano Zampini                               Bnnz,
4063ed502f03SStefano Zampini                               m,
40642ed87e7eSStefano Zampini                               Bcoo->data().get(),
4065ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
40662ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
40672ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
40682ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
40698909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4070ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4071ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
40728909a122SStefano Zampini #else
40738909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
40748909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
40758909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
40768909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
40778909a122SStefano Zampini #endif
40782ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
40792ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
40802ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
40812ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
40822ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
40832ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4084ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4085ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4086ed502f03SStefano Zampini       thrust::advance(p2,Annz);
40872ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
40888909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
40898909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
40908909a122SStefano Zampini #endif
40912ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
40922ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
40932ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
40942ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
40952ed87e7eSStefano Zampini #else
40962ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
40972ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
40982ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
40992ed87e7eSStefano Zampini #endif
4100ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
41012ed87e7eSStefano Zampini                               Ccoo->data().get(),
4102ed502f03SStefano Zampini                               c->nz,
4103ed502f03SStefano Zampini                               m,
4104ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4105ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4106ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4107ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
41082ed87e7eSStefano Zampini       delete wPerm;
41092ed87e7eSStefano Zampini       delete Acoo;
41102ed87e7eSStefano Zampini       delete Bcoo;
41112ed87e7eSStefano Zampini       delete Ccoo;
4112ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4113ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4114ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4115ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4116ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4117ed502f03SStefano Zampini #endif
41181a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4119ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4120ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4121ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4122ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4123ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4124ed502f03SStefano Zampini 
41251a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
41261a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4127a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4128ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4129ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4130ed502f03SStefano Zampini         CcsrT->num_rows = n;
4131ed502f03SStefano Zampini         CcsrT->num_cols = m;
4132ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4133ed502f03SStefano Zampini 
4134ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4135ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4136ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4137ed502f03SStefano Zampini 
4138ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4139ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4140ed502f03SStefano Zampini         if (AT) {
4141ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4142ed502f03SStefano Zampini           thrust::advance(rT,-1);
4143ed502f03SStefano Zampini         }
4144ed502f03SStefano Zampini         if (BT) {
4145ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4146ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4147ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4148ed502f03SStefano Zampini         }
4149ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4150ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4151ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4152ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4153ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4154ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4155ed502f03SStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4156ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4157ed502f03SStefano Zampini 
4158ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4159ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4160ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4161ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4162ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4163ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4164ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4165ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4166ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4167ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4168ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4169ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4170ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4171ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4172ed502f03SStefano Zampini #endif
4173ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4174ed502f03SStefano Zampini       }
4175ed502f03SStefano Zampini     }
4176ed502f03SStefano Zampini 
4177ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4178ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4179ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4180ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4181ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4182ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4183ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4184ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4185ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4186ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4187ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4188ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4189ed502f03SStefano Zampini     } else {
4190ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4191ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4192ed502f03SStefano Zampini     }
4193ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4194ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4195ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4196ed502f03SStefano Zampini     c->maxnz = c->nz;
4197ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4198ed502f03SStefano Zampini     c->rmax = 0;
4199ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4200ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4201ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4202ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4203ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4204ed502f03SStefano Zampini     }
4205ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4206ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4207ed502f03SStefano Zampini     (*C)->nonzerostate++;
4208ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4209ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4210ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4211ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4212ed502f03SStefano Zampini   } else {
4213ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4214ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4215ed502f03SStefano Zampini     if (c->nz) {
4216ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4217ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4218ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4219ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4220ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4221ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4222ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4223ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4224ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4225ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4226ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4227ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4228ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4229ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4230ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4231ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4232ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4233ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4234ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4235ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4236ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4237ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4238ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4239ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4240ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4241ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4242ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4243ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4244ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4245a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
42461a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4247ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4248ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4249ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4250ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4251ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4252ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4253ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4254ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
42551a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4256ed502f03SStefano Zampini       }
4257ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4258ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4259ed502f03SStefano Zampini     }
4260ed502f03SStefano Zampini   }
4261ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4262ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4263ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4264ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4265ed502f03SStefano Zampini   PetscFunctionReturn(0);
4266ed502f03SStefano Zampini }
4267c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies entries of the value array of A (as returned by
  MatSeqAIJCUSPARSEGetArrayRead()) into v.

  Input Parameters:
+ A   - the matrix
. n   - number of entries to copy
- idx - optional host array of n offsets into the value array; when it is NULL (or n is 0)
        the first n values are copied contiguously

  Output Parameter:
. v   - destination array; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  When v is host memory and idx is given, the gather is done into a temporary device buffer
  which is then copied back to the host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    THRUSTARRAY    w; /* device staging buffer, only sized when v is host memory; stack-owned so
                         that an early error return cannot leak it */
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather: dv[k] <- av[idx[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4307bddcd29dSMark Adams 
4308bddcd29dSMark Adams /*
4309bddcd29dSMark Adams   LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
4310bddcd29dSMark Adams 
4311bddcd29dSMark Adams   requires:
4312bddcd29dSMark Adams      structurally symmetric: fix with transpose/column meta data
4313bddcd29dSMark Adams */
4314bddcd29dSMark Adams 
4315bddcd29dSMark Adams /*
4316bddcd29dSMark Adams   The GPU LU factor kernel
4317bddcd29dSMark Adams */
/*
  Fills the row-offset array bi_csr of the band storage from a closed-form expression in the
  row index, the band width bw and the matrix size n.

  gridDim.x is used as the number of fields and gridDim.y as the number of blocks per field;
  each field owns n/gridDim.x consecutive rows, split evenly (rounded up) over its blocks.
  Rows inside a block are strided over threadIdx.y; only threads with threadIdx.x == 0 write.
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt numFields = gridDim.x, blocksPerField = gridDim.y;
  const PetscInt fieldId = blockIdx.x, blockId = blockIdx.y;
  const PetscInt rowsPerField = n/numFields;
  const PetscInt rowsPerBlock = (rowsPerField/blocksPerField + !!(rowsPerField%blocksPerField));
  const PetscInt firstRow = fieldId*rowsPerField + blockId*rowsPerBlock;
  const PetscInt fieldEnd = (fieldId+1)*rowsPerField;
  const PetscInt lastRow = (firstRow + rowsPerBlock) > fieldEnd ? fieldEnd : (firstRow + rowsPerBlock); // do not run into the next field

  // a single thread of the whole grid writes the leading zero offset
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0;
  // only the x == 0 thread of each thread row writes offsets; there is no barrier below, so the others can leave
  if (threadIdx.x != 0) return;
  for (int row = firstRow + threadIdx.y; row < lastRow; row += blockDim.y) {
    const PetscInt rowP1   = row+1;
    const PetscInt ni      = (row>bw) ? bw+1 : rowP1;
    const PetscInt n1L     = ni*(ni-1)/2;
    const PetscInt nug     = rowP1*bw;
    const PetscInt n2L     = bw*((row>bw) ? (row-bw) : 0);
    const PetscInt overrun = bw + row + 1 - n;
    const PetscInt clip    = (overrun>0) ? overrun*(overrun-1)/2 + overrun : 0;
    bi_csr[row+1] = n1L + nug - clip + n2L + rowP1;
  }
}
// copy AIJ to AIJ_BAND
/*
  Zeroes the band value array ba_csr and then scatters the values of the CSR matrix
  (ai_d, aj_d, aa_d) into it: band row rowb takes its entries from CSR row r[rowb], and each
  column index is mapped through ic[] before being placed relative to the first stored column
  of the band row (max(rowb-bw,0)).

  gridDim.x is used as the number of fields and gridDim.y as the number of blocks per field;
  each field owns n/gridDim.x consecutive rows, split evenly (rounded up) over its blocks.
  Rows inside a block are strided over threadIdx.y and the entries of a row over threadIdx.x.
  The same block handles the same rows in both passes, so a block-level barrier between the
  passes is enough to order them.
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    PetscScalar    *batmp = ba_csr + bi_csr[rowb];
    const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
    for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) batmp[j] = 0;
  }

  // The scatter below writes entries that a different threadIdx.x zeroed above; without a
  // barrier a late zero could overwrite an already copied value. Every thread of the block
  // reaches this point, so the barrier is safe.
  __syncthreads();

  // copy A into B with CSR format
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
    const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
    const PetscScalar *av    = aa_d + ai_d[rowa];
    PetscScalar       *batmp = ba_csr + bi_csr[rowb];
    /* load in initial (unfactored row) */
    for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
      const PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart; // position inside the band row
      batmp[idx] = av[j];
    }
  }
}
// print AIJ_BAND
/*
  Debug helper: a single thread of the grid prints, row by row, the real part of every stored
  value of the band matrix followed by the row's end offset bi_csr[row+1].
*/
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug: everything is printed by the one thread whose thread and block indices are all zero
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y != 0) return;
  printf("B (AIJ) n=%d:\n",(int)n);
  for (int row = 0; row < n; row++) {
    const int rowBegin = bi_csr[row], rowEnd = bi_csr[row+1];
    for (int k = rowBegin; k < rowEnd; k++) printf("(%13.6e) ",PetscRealPart(ba_csr[k]));
    printf(" bi=%d\n",rowEnd);
  }
}
// Band LU kernel ---  ba_csr bi_csr
/*
  In-place LU factorization of the band matrix stored in (bi_csr, ba_csr).

  gridDim.x is used as the number of fields (each owning n/gridDim.x consecutive rows) and
  gridDim.y as the number of blocks cooperating on one field. For every diagonal entry the
  rows below it inside the band are scaled by the pivot (done by the threadIdx.x == 0 thread
  of each thread row) and then updated with the pivot row (entries strided over threadIdx.x).

  With CUDA >= 11 the kernel synchronizes the whole grid after each pivot via
  cooperative_groups::this_grid().sync(), so it has to be launched cooperatively; otherwise
  only a block-level barrier is used.

  Any dynamic shared memory passed at launch is not used: the per-row offset kIdx is cheap
  arithmetic that every thread recomputes, which avoids re-writing a shared slot while other
  threads of the block may still be reading the value of the previous loop trip.
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  start = field*nloc, end = start + nloc;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD=start; glbDD<end; glbDD++) {
    PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter ears
    const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
    // trip count padded to a multiple of blockDim.y so that all threads of a block hit the barrier below equally often
    const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
    PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
    const PetscScalar Bdd = *pBdd;
    const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
      const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
      if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
        PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid/Bdd;
      }
      __syncthreads(); // make the scaled multiplier visible to all threadIdx.x of the row
      if (idx < nzUd) { /* assuming symmetric structure */
        PetscScalar       *Aid = ba_csr + bi_csr[myi] + kIdx;
        PetscScalar       *Aij = Aid + 1;
        const PetscScalar Lid  = *Aid;
        for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) Aij[jIdx] -= Lid*baUd[jIdx];
      }
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    g.sync();
#else
    __syncthreads();
#endif
  } /* endof for (i=0; i<n; i++) { */
}
4443bddcd29dSMark Adams 
4444bddcd29dSMark Adams static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
/*
  MatLUFactorNumeric_SeqAIJCUSPARSEBAND - numeric band LU factorization on the GPU.

  Input Parameters:
+ B    - the factor matrix; its spptr must hold the Mat_SeqAIJCUSPARSETriFactors with the band
         arrays (i_band_d, a_band_d) and the row/column permutations
. A    - the matrix to factor; its spptr must hold the CSR data on the device
- info - factorization options (not used here)

  Notes:
  The values of A are first copied into the band storage of B (mat_lu_factor_band_copy_aij_aij)
  and then factored in place (mat_lu_factor_band). If a container named "Nf" is composed with
  A, its value modulo 1000 is used as the number of fields and its value divided by 1000, when
  positive, as the concurrency factor used to size the grid.

  On success B->ops->solve is set to MatSolve_SeqAIJCUSPARSEBAND. Nothing is done when A has no rows.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               n=A->rmap->n, *ic, *r;
  const int                    *ai_d, *aj_d;
  const PetscScalar            *aa_d;
  PetscScalar                  *ba_t;
  int                          *bi_t;
  PetscContainer               container;
  int                          Ni = 1, team_size, Nf = 1, nVec, nconcurrent = 1, nsm = -1;

  PetscFunctionBegin;
  /* error checks must come after PetscFunctionBegin so that the PETSc stack stays consistent */
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ba_t = cusparseTriFactors->a_band_d;
  bi_t = cusparseTriFactors->i_band_d;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // cusparse setup
  if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
  if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matrixA = (CsrMatrix*)matstructA->mat;
  if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
  } else Nf = 1;
  /* Nf is used as a divisor below, reject non-positive values instead of dividing by zero */
  if (Nf < 1) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_OUTOFRANGE,"Invalid number of fields %D",(PetscInt)Nf);
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n %% Nf != 0 %D %D",n,(PetscInt)Nf);

  // get data
  ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d    = thrust::raw_pointer_cast(matrixA->values->data().get());
  r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  {
    /* band width recovered from the number of stored nonzeros of B; the discriminant is formed
       in 64-bit arithmetic since 4*n*n overflows a 32-bit PetscInt already for moderate n */
    const PetscInt64 disc = 1 + 4*((PetscInt64)n*(PetscInt64)n - (PetscInt64)b->nz);
    int              bw   = (2*n-1 - (int)(PetscSqrtReal((PetscReal)disc)+PETSC_MACHINE_EPSILON))/2;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
    Ni = 1; /* no grid-wide synchronization is used in this case: a single block per field */
#else
    {
      int            gpuid;
      cudaDeviceProp prop;
      cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);
      cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr);
      nsm = prop.multiProcessorCount;
      Ni  = nsm/Nf/nconcurrent;
      if (Ni < 1) Ni = 1; /* more fields (times concurrency) than SMs: still use one block per field */
    }
#endif
    /* block shape: team_size thread rows (y) of nVec threads (x). Both are kept within
       [1,1024]; the kernels stride over rows and entries, so any such shape covers the matrix */
    team_size = bw/Ni + !!(bw%Ni);
    team_size = PetscMin(PetscMax(team_size,1),1024);
    nVec      = PetscMax(1,PetscMin(bw, 1024/team_size));
    ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr);
    {
      dim3 dimBlockTeam(nVec,team_size);
      dim3 dimBlockLeague(Nf,Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      CHECK_LAUNCH_ERROR(); // does a sync
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      {
        /* kernel arguments are read through these pointers with the size of the kernel's
           parameter types, so the band width must be handed over as a PetscInt */
        const PetscInt bw_k = bw;
        void           *kernelArgs[] = { (void*)&n, (void*)&bw_k, (void*)&bi_t, (void*)&ba_t};
        cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr);
      }
#else
      mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
#endif
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(PETSC_USE_LOG)
      {
        /* evaluated in floating point to avoid integer overflow; both divisions are exact */
        const PetscLogDouble dbm1 = bw-1, dbw = bw, dnl = n/Nf;
        ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(dbm1*(dbm1 + 1)*(2*dbm1 + 1)/3 + 2*(dnl-dbw)*dbw*dbw + dnl*(dnl+1)/2));CHKERRQ(ierr);
      }
#endif
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose = NULL; // need transpose
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  PetscFunctionReturn(0);
}
4536bddcd29dSMark Adams 
/*
  MatrixNfDestroy - releases, with PetscFree(), the memory passed in ptr.
*/
static PetscErrorCode MatrixNfDestroy(void *ptr)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscFree(ptr);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4545bddcd29dSMark Adams 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSEBAND - symbolic phase of the band LU solver.

  Input Parameters:
+ B     - the factor matrix; B->spptr must already hold a Mat_SeqAIJCUSPARSETriFactors
. A     - the matrix to factor; must be square, have every diagonal entry present and
          have MAT_STRUCTURALLY_SYMMETRIC set
. isrow - row permutation
. iscol - column permutation
- info  - factorization options (not referenced by this routine)

  Notes:
  If A carries a composed "Nf" container, its value modulo 1000 is taken as the number of
  fields and a private copy of it is composed on B; otherwise Nf = 1.  n must be a
  multiple of Nf.

  The lower/upper bandwidths of A are measured under the inverse of iscol and must be equal.
  The routine then sizes the band storage (nz = n + (2n-1)*bw - bw^2), allocates the device
  arrays a_band_d (nz+1 scalars) and i_band_d (n+1 ints), launches
  mat_lu_factor_band_init_set_i on i_band_d, uploads the row and inverse-column permutations,
  and installs MatLUFactorNumeric_SeqAIJCUSPARSEBAND as the numeric factorization.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data,*b;
  IS                           isicol;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               *ic,*ai=a->i,*aj=a->j;
  PetscScalar                  *ba_t;
  int                          *bi_t;
  PetscInt                     i,n=A->rmap->n,Nf;
  PetscInt                     nzBcsr,bwL,bwU;
  PetscBool                    missing,structsym;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscContainer               container;

  PetscFunctionBegin;
  if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
  ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
  if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
  ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&structsym);CHKERRQ(ierr);
  if (!structsym) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");

  /* factor: get Nf if available */
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    /* Nf is used as a divisor and as the launch grid size below, so it must be positive */
    if (Nf <= 0) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_OUTOFRANGE,"Number of fields must be positive, got %D",Nf);
    /* give B its own copy of Nf so that the solve can find it on the factor matrix */
    ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
    *pNf = Nf;
    ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
    ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
    ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  } else Nf = 1;
  /* the literal '%' must be escaped, otherwise it is parsed as a conversion specifier */
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n %% Nf != 0 %D %D",n,Nf);

  ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
  b    = (Mat_SeqAIJ*)(B)->data;

  /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (PetscInt rwb=0; rwb<n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (PetscInt j=0;j<anz;j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb<rwa) { /* L */
        if (rwa-colb > bwL) bwL = rwa-colb;
      } else {        /* U (and diagonal) */
        if (colb-rwa > bwU) bwU = colb-rwa;
      }
    }
  }
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  /* only support structurally symmetric, but it might work */
  if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; /* only meta data needed: n & nz */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); /* include a place for flops */
  cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
  ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
  {
    /* one thread block per field, 1 x 128 threads in each block */
    dim3 dimBlockTeam(1,128);
    dim3 dimBlockLeague(Nf,1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
  }
  CHECK_LAUNCH_ERROR(); /* does a sync */

  /* setup data: row permutation on the device */
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  /* inverse column permutation on the device */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* put together the new matrix */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row  = isrow;
  b->col  = iscol;
  ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
  ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
  b->icol = isicol;
  ierr    = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
  }
#endif
  if (a->inode.size) {
    ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
  }
  ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  PetscFunctionReturn(0);
}
4686bddcd29dSMark Adams 
/*
  MatFactorGetSolverType_seqaij_cusparse_band - reports the solver package of a band factor matrix.

  Input Parameter:
. A - the factor matrix (not inspected)

  Output Parameter:
. type - set to MATSOLVERCUSPARSEBAND

  Use -pc_factor_mat_solver_type cusparseband
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  (void)A; /* the answer does not depend on the matrix */
  *type = MATSOLVERCUSPARSEBAND;
  PetscFunctionReturn(0);
}
4694bddcd29dSMark Adams 
/*
  MatGetFactor_seqaijcusparse_cusparse_band - creates the (empty) factor matrix for the band LU solver.

  Input Parameters:
+ A     - the matrix that will be factored; supplies the communicator, local size and block sizes
- ftype - requested factorization; only MAT_FACTOR_LU is accepted

  Output Parameter:
. B - new n x n MATSEQAIJCUSPARSE matrix with the band symbolic LU routine installed and
      "MatFactorGetSolverType_C" composed on it

  Notes:
  The factor type is validated before anything is created, so an unsupported request fails
  with PETSC_ERR_SUP without leaving a partially configured matrix behind in *B.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  if (ftype != MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4717bddcd29dSMark Adams 
#define WARP_SIZE 32
/*
  wreduce - sum of one value per lane across a warp, using shuffle-down with a full-warp mask.

  Every one of the 32 lanes must call this together (the mask is 0xffffffff).  After the last
  step lane 0 holds the sum over all lanes; the values returned on the other lanes are
  partial sums and should not be used.
*/
template <typename T>
__forceinline__ __device__
T wreduce(T a)
{
  #pragma unroll
  for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
    /* pull the running value from the lane `offset` positions above and accumulate it */
    a += __shfl_down_sync(0xffffffff, a, offset);
  }
  return a;
}
// reduce in a block, returns result in thread 0
//
// Sums one value per thread over a 1D thread block of BLOCK_SIZE threads (indexed by
// threadIdx.x).  Only thread 0 receives the total; other threads of warp 0 get partial
// sums and threads of the remaining warps get their own input back.
//
// Must be called by every thread of the block (it contains __syncthreads()), and the block
// must really be launched with BLOCK_SIZE threads.  BLOCK_SIZE has to be a whole number of
// warps because the shuffles use a full-warp mask.
template <typename T, int BLOCK_SIZE>
__device__
T breduce(T a)
{
  static_assert(BLOCK_SIZE >= WARP_SIZE && BLOCK_SIZE % WARP_SIZE == 0, "BLOCK_SIZE must be a positive multiple of WARP_SIZE");
  static_assert(BLOCK_SIZE/WARP_SIZE <= WARP_SIZE, "per-warp partial sums must fit in a single warp");
  constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
  __shared__ T  buf[NWARP]; // one partial sum per warp, stored in the reduction type itself
  const int     wid    = threadIdx.x / WARP_SIZE;
  const int     laneid = threadIdx.x % WARP_SIZE;
  const T       b      = wreduce<T>(a);

  if (laneid == 0) buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    // lanes beyond NWARP contribute zero, so a full-warp reduction is correct for any
    // NWARP (not only powers of two)
    a = (laneid < NWARP) ? buf[laneid] : T(0);
    a = wreduce<T>(a);
  }
  // warp 0 must be done reading buf before any warp overwrites it in a subsequent call
  __syncthreads();
  return a;
}
4755bddcd29dSMark Adams 
4756bddcd29dSMark Adams 
// Band LU kernel ---  ba_csr bi_csr
//
// In-place forward (L, unit diagonal) and backward (U) substitution with the band factors.
//
// Launch layout: 1D grid with one thread block per field (gridDim.x = Nf) and BLOCK_SIZE
// threads per block (the block reduction is instantiated with BLOCK_SIZE).  Block `field`
// handles rows [field*n/Nf, (field+1)*n/Nf) and only reads x entries inside that range.
//
//   n      - number of rows
//   bw     - band width (the same value is used for the lower and upper parts)
//   ba_csr - packed band factor values; rows are stored one after another, the first and
//            last bw rows of the whole matrix are shortened
//   x      - right-hand side on entry, solution on exit
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    nfields    = gridDim.x;
  const PetscInt    rowsPerFld = n/nfields;
  const PetscInt    fld        = blockIdx.x;
  const PetscInt    rowBegin   = fld*rowsPerFld;
  const PetscInt    rowEnd     = rowBegin + rowsPerFld;
  const PetscInt    cornerNz   = bw*(bw+1)/2;            // entries chopped off at one matrix corner
  const PetscInt    fldNz      = (2*bw+1)*rowsPerFld;    // storage of a full (unchopped) field
  const PetscInt    firstFldNz = fldNz - cornerNz;       // storage of field 0
  const int         lane       = threadIdx.x;
  const PetscScalar *cursor;

  /* Next, solve L */
  cursor = ba_csr; // diagonal (0,0) in field
  if (fld != 0) cursor += firstFldNz + (fld-1)*fldNz + bw;
  for (int row=rowBegin, loc=0; row<rowEnd; row++, loc++) {
    const PetscInt firstCol = (loc<bw) ? rowBegin : (row-bw);
    PetscScalar    acc      = 0;
    PetscInt       step;

    // each thread accumulates a strided share of the row's L entries times x
    for (int j=firstCol+lane, k=lane; j<row; j+=blockDim.x, k+=blockDim.x) {
      acc += cursor[k]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal accRe = PetscRealPartComplex(acc), accIm = PetscImaginaryPartComplex(acc);
    PetscScalar accSum(breduce<PetscReal,BLOCK_SIZE>(accRe), breduce<PetscReal,BLOCK_SIZE>(accIm));
    acc = accSum;
#else
    acc = breduce<PetscReal,BLOCK_SIZE>(acc);
#endif
    if (threadIdx.x == 0) {
      x[row] -= acc; // /1.0
    }
    __syncthreads();
    // advance to the start of the next row
    step  = row-firstCol;                         // get to diagonal
    step += (row > n-1-bw) ? (n-1-row) : bw;      // skip over U, only last block has funny offset
    step += 1;                                    // skip to next row
    if (fld>0 && (loc+1)<bw) step += bw-(loc+1);  // skip padding at beginning (ear)
    cursor += step;
  }
  /* Then, solve U */
  cursor = ba_csr + nfields*fldNz - 2*cornerNz - 1; // end of real data on block (diagonal)
  if (fld != nfields-1) cursor -= firstFldNz + (nfields-2-fld)*fldNz + bw; // diagonal of last local row
  for (int row=rowEnd-1, loc=0; row>=rowBegin; row--, loc++) {
    const PetscInt lastCol = (loc<bw) ? rowEnd-1 : row+bw; // end of row in U
    PetscScalar    acc     = 0;
    PetscInt       back;

    // walk the U entries right-to-left: cursor[-k] pairs with x[lastCol-k]
    for (int j=lastCol-lane, k=lane; j>row; j-=blockDim.x, k+=blockDim.x) {
      acc += cursor[-k]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal accRe = PetscRealPartComplex(acc), accIm = PetscImaginaryPartComplex(acc);
    PetscScalar accSum(breduce<PetscReal,BLOCK_SIZE>(accRe), breduce<PetscReal,BLOCK_SIZE>(accIm));
    acc = accSum;
#else
    acc = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(acc));
#endif
    cursor -= lastCol-row; // diagonal
    if (threadIdx.x == 0) {
      x[row] -= acc;
      x[row] /= cursor[0];
    }
    __syncthreads();
    // inc past L to start of previous U
    back = bw+1;
    if (row<bw) back -= bw-row;                                      // overshot in top left corner
    if (((loc+1) < bw) && fld != nfields-1) back += bw - (loc+1);    // skip past right corner
    cursor -= back;
  }
}
4820bddcd29dSMark Adams 
/*
  MatSolve_SeqAIJCUSPARSEBAND - solves A x = b on the GPU with the band LU factors.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors with the band
       values (a_band_d), the work vector and the device permutations
- bb - right-hand side

  Output Parameter:
. xx - solution

  Notes:
  The right-hand side is gathered through the row permutation into the work vector, the
  mat_solve_band kernel performs the L and U substitutions in place with one thread block
  per field, and the result is gathered through the column permutation into xx.

  The number of fields is read from the "Nf" container composed on A (modulo 1000), or is 1
  when there is none; n must be a multiple of it.

  The band width is not stored; it is recovered from nz = n + (2n-1)*bw - bw^2, i.e.
  bw = ((2n-1) - sqrt(1 + 4(n^2 - nz)))/2.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU;
  const PetscInt                        n = A->rmap->n;
  PetscInt                              nz,bw,Nf;
  PetscReal                             disc;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscContainer                        container;

  PetscFunctionBegin;
  if (n == 0) {
    PetscFunctionReturn(0);
  }
  tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  nz      = cusparseTriFactors->nnz;
  /* Evaluate the discriminant in floating point: n*n (and 4*(n*n-nz)) overflow a 32-bit PetscInt
     already for n in the tens of thousands.  The exact root is the odd integer 2*(n-bw)-1, so
     round to nearest instead of truncating. */
  disc = 1.0 + 4.0*((PetscReal)n*(PetscReal)n - (PetscReal)nz);
  bw   = (2*n-1 - (PetscInt)(PetscSqrtReal(disc) + 0.5))/2;

  /* factor: get Nf if available */
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
  } else Nf = 1;
  /* Nf is a divisor and the kernel grid size, so it must be positive */
  if (Nf <= 0) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_OUTOFRANGE,"Number of fields must be positive, got %D",Nf);
  /* the literal '%' must be escaped, otherwise it is parsed as a conversion specifier */
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n %% Nf != 0 %D %D",n,Nf);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  /* the template argument and the launch block size must agree: the kernel's block reduction is sized by it */
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); /* does a sync */

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4874