xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 85ba735771a8b5cec8c45d73c93d760c26689b57)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
189ae82921SPaul Mullowney 
19087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22087f3262SPaul Mullowney 
236fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
246fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
256fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26087f3262SPaul Mullowney 
276fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
286fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
306fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
314416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
326fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
336fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
346fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
356fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
399ae82921SPaul Mullowney 
407f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
467f756511SDominic Meiser 
47b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
48b06137fdSPaul Mullowney {
49b06137fdSPaul Mullowney   cusparseStatus_t   stat;
50b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
51b06137fdSPaul Mullowney 
52b06137fdSPaul Mullowney   PetscFunctionBegin;
53b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
5457d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
55b06137fdSPaul Mullowney   PetscFunctionReturn(0);
56b06137fdSPaul Mullowney }
57b06137fdSPaul Mullowney 
58b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
59b06137fdSPaul Mullowney {
60b06137fdSPaul Mullowney   cusparseStatus_t   stat;
61b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
62b06137fdSPaul Mullowney 
63b06137fdSPaul Mullowney   PetscFunctionBegin;
646b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
6516a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
6657d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
6716a2e217SAlejandro Lamas Daviña     }
68b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
696b1cf21dSAlejandro Lamas Daviña   }
7057d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
71b06137fdSPaul Mullowney   PetscFunctionReturn(0);
72b06137fdSPaul Mullowney }
73b06137fdSPaul Mullowney 
74b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
75b06137fdSPaul Mullowney {
76b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
77ccdfe979SStefano Zampini 
78b06137fdSPaul Mullowney   PetscFunctionBegin;
79ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
80b06137fdSPaul Mullowney   PetscFunctionReturn(0);
81b06137fdSPaul Mullowney }
82b06137fdSPaul Mullowney 
83ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
849ae82921SPaul Mullowney {
859ae82921SPaul Mullowney   PetscFunctionBegin;
869ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
879ae82921SPaul Mullowney   PetscFunctionReturn(0);
889ae82921SPaul Mullowney }
899ae82921SPaul Mullowney 
90c708e6cdSJed Brown /*MC
91087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
93087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
94087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
97c708e6cdSJed Brown 
989ae82921SPaul Mullowney   Level: beginner
99c708e6cdSJed Brown 
1003ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101c708e6cdSJed Brown M*/
1029ae82921SPaul Mullowney 
10342c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1049ae82921SPaul Mullowney {
1059ae82921SPaul Mullowney   PetscErrorCode ierr;
106bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1079ae82921SPaul Mullowney 
1089ae82921SPaul Mullowney   PetscFunctionBegin;
109bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
110404133a2SPaul Mullowney   (*B)->factortype = ftype;
111bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1129ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1132205254eSKarl Rupp 
114087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
11533d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1169ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1179ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
118087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
119087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
120087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1219ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
122bc3f50f2SPaul Mullowney 
123fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1243ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1259ae82921SPaul Mullowney   PetscFunctionReturn(0);
1269ae82921SPaul Mullowney }
1279ae82921SPaul Mullowney 
128bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
129ca45077fSPaul Mullowney {
130aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1316e111a19SKarl Rupp 
132ca45077fSPaul Mullowney   PetscFunctionBegin;
133ca45077fSPaul Mullowney   switch (op) {
134e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
135aa372e3fSPaul Mullowney     cusparsestruct->format = format;
136ca45077fSPaul Mullowney     break;
137e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
138aa372e3fSPaul Mullowney     cusparsestruct->format = format;
139ca45077fSPaul Mullowney     break;
140ca45077fSPaul Mullowney   default:
14136d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
142ca45077fSPaul Mullowney   }
143ca45077fSPaul Mullowney   PetscFunctionReturn(0);
144ca45077fSPaul Mullowney }
1459ae82921SPaul Mullowney 
146e057df02SPaul Mullowney /*@
147e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
148e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
149aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
150e057df02SPaul Mullowney    Not Collective
151e057df02SPaul Mullowney 
152e057df02SPaul Mullowney    Input Parameters:
1538468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
15436d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1552692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
156e057df02SPaul Mullowney 
157e057df02SPaul Mullowney    Output Parameter:
158e057df02SPaul Mullowney 
159e057df02SPaul Mullowney    Level: intermediate
160e057df02SPaul Mullowney 
1618468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
162e057df02SPaul Mullowney @*/
163e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
164e057df02SPaul Mullowney {
165e057df02SPaul Mullowney   PetscErrorCode ierr;
1666e111a19SKarl Rupp 
167e057df02SPaul Mullowney   PetscFunctionBegin;
168e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
169e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
170e057df02SPaul Mullowney   PetscFunctionReturn(0);
171e057df02SPaul Mullowney }
172e057df02SPaul Mullowney 
173e6e9a74fSStefano Zampini /*@
174e6e9a74fSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
175e6e9a74fSStefano Zampini 
176e6e9a74fSStefano Zampini    Collective on mat
177e6e9a74fSStefano Zampini 
178e6e9a74fSStefano Zampini    Input Parameters:
179e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
180e6e9a74fSStefano Zampini -  transgen - the boolean flag
181e6e9a74fSStefano Zampini 
182e6e9a74fSStefano Zampini    Level: intermediate
183e6e9a74fSStefano Zampini 
184e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE
185e6e9a74fSStefano Zampini @*/
186e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
187e6e9a74fSStefano Zampini {
188e6e9a74fSStefano Zampini   PetscErrorCode ierr;
189e6e9a74fSStefano Zampini   PetscBool      flg;
190e6e9a74fSStefano Zampini 
191e6e9a74fSStefano Zampini   PetscFunctionBegin;
192e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
193e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
194e6e9a74fSStefano Zampini   if (flg) {
195e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
196e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
197e6e9a74fSStefano Zampini     cusp->transgen = transgen;
198e6e9a74fSStefano Zampini   }
199e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
200e6e9a74fSStefano Zampini }
201e6e9a74fSStefano Zampini 
2024416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2039ae82921SPaul Mullowney {
2049ae82921SPaul Mullowney   PetscErrorCode           ierr;
205e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2069ae82921SPaul Mullowney   PetscBool                flg;
207a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2086e111a19SKarl Rupp 
2099ae82921SPaul Mullowney   PetscFunctionBegin;
210e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2119ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
212e6e9a74fSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusparsestruct->transgen,&cusparsestruct->transgen,NULL);CHKERRQ(ierr);
213e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
214a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
215e057df02SPaul Mullowney     if (flg) {
216e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
217045c96e1SPaul Mullowney     }
2189ae82921SPaul Mullowney   }
2194c87dfd4SPaul Mullowney   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
220a183c035SDominic Meiser                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
2214c87dfd4SPaul Mullowney   if (flg) {
2224c87dfd4SPaul Mullowney     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
2234c87dfd4SPaul Mullowney   }
2240af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2259ae82921SPaul Mullowney   PetscFunctionReturn(0);
2269ae82921SPaul Mullowney }
2279ae82921SPaul Mullowney 
2286fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2299ae82921SPaul Mullowney {
2309ae82921SPaul Mullowney   PetscErrorCode ierr;
2319ae82921SPaul Mullowney 
2329ae82921SPaul Mullowney   PetscFunctionBegin;
2339ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2349ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2359ae82921SPaul Mullowney   PetscFunctionReturn(0);
2369ae82921SPaul Mullowney }
2379ae82921SPaul Mullowney 
2386fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2399ae82921SPaul Mullowney {
2409ae82921SPaul Mullowney   PetscErrorCode ierr;
2419ae82921SPaul Mullowney 
2429ae82921SPaul Mullowney   PetscFunctionBegin;
2439ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2449ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2459ae82921SPaul Mullowney   PetscFunctionReturn(0);
2469ae82921SPaul Mullowney }
2479ae82921SPaul Mullowney 
248087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
249087f3262SPaul Mullowney {
250087f3262SPaul Mullowney   PetscErrorCode ierr;
251087f3262SPaul Mullowney 
252087f3262SPaul Mullowney   PetscFunctionBegin;
253087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
254087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
255087f3262SPaul Mullowney   PetscFunctionReturn(0);
256087f3262SPaul Mullowney }
257087f3262SPaul Mullowney 
258087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
259087f3262SPaul Mullowney {
260087f3262SPaul Mullowney   PetscErrorCode ierr;
261087f3262SPaul Mullowney 
262087f3262SPaul Mullowney   PetscFunctionBegin;
263087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
264087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
265087f3262SPaul Mullowney   PetscFunctionReturn(0);
266087f3262SPaul Mullowney }
267087f3262SPaul Mullowney 
268087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
2699ae82921SPaul Mullowney {
2709ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
2719ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
2729ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
273aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
2749ae82921SPaul Mullowney   cusparseStatus_t                  stat;
2759ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
2769ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
2779ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
2789ae82921SPaul Mullowney   PetscScalar                       *AALo;
2799ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
280b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
28157d48284SJunchao Zhang   cudaError_t                       cerr;
2829ae82921SPaul Mullowney 
2839ae82921SPaul Mullowney   PetscFunctionBegin;
284cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
285c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2869ae82921SPaul Mullowney     try {
2879ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2889ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2899ae82921SPaul Mullowney 
2909ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
29157d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
29257d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
29357d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
2949ae82921SPaul Mullowney 
2959ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
2969ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
2979ae82921SPaul Mullowney       AiLo[n]  = nzLower;
2989ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
2999ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
3009ae82921SPaul Mullowney       v        = aa;
3019ae82921SPaul Mullowney       vi       = aj;
3029ae82921SPaul Mullowney       offset   = 1;
3039ae82921SPaul Mullowney       rowOffset= 1;
3049ae82921SPaul Mullowney       for (i=1; i<n; i++) {
3059ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
306e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
3079ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
3089ae82921SPaul Mullowney         rowOffset += nz+1;
3099ae82921SPaul Mullowney 
310580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
311580bdb30SBarry Smith         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3129ae82921SPaul Mullowney 
3139ae82921SPaul Mullowney         offset      += nz;
3149ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
3159ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
3169ae82921SPaul Mullowney         offset      += 1;
3179ae82921SPaul Mullowney 
3189ae82921SPaul Mullowney         v  += nz;
3199ae82921SPaul Mullowney         vi += nz;
3209ae82921SPaul Mullowney       }
3212205254eSKarl Rupp 
322aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
323aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3242205254eSKarl Rupp 
325aa372e3fSPaul Mullowney       /* Create the matrix description */
32657d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
32757d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
32857d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
32957d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
33057d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
331aa372e3fSPaul Mullowney 
332aa372e3fSPaul Mullowney       /* Create the solve analysis information */
33357d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
334aa372e3fSPaul Mullowney 
335aa372e3fSPaul Mullowney       /* set the operation */
336aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
337aa372e3fSPaul Mullowney 
338aa372e3fSPaul Mullowney       /* set the matrix */
339aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
340aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
341aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
342aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
343aa372e3fSPaul Mullowney 
344aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
345aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
346aa372e3fSPaul Mullowney 
347aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
348aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
349aa372e3fSPaul Mullowney 
350aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
351aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
352aa372e3fSPaul Mullowney 
353aa372e3fSPaul Mullowney       /* perform the solve analysis */
354aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
355aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
356aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
35757d48284SJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
358aa372e3fSPaul Mullowney 
359aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
360aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
3612205254eSKarl Rupp 
36257d48284SJunchao Zhang       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
36357d48284SJunchao Zhang       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
36457d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
3654863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
3669ae82921SPaul Mullowney     } catch(char *ex) {
3679ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3689ae82921SPaul Mullowney     }
3699ae82921SPaul Mullowney   }
3709ae82921SPaul Mullowney   PetscFunctionReturn(0);
3719ae82921SPaul Mullowney }
3729ae82921SPaul Mullowney 
373087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
3749ae82921SPaul Mullowney {
3759ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3769ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3779ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
378aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
3799ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3809ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
3819ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3829ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3839ae82921SPaul Mullowney   PetscScalar                       *AAUp;
3849ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
3859ae82921SPaul Mullowney   PetscErrorCode                    ierr;
38657d48284SJunchao Zhang   cudaError_t                       cerr;
3879ae82921SPaul Mullowney 
3889ae82921SPaul Mullowney   PetscFunctionBegin;
389cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
390c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3919ae82921SPaul Mullowney     try {
3929ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3939ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
3949ae82921SPaul Mullowney 
3959ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
39657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
39757d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
39857d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
3999ae82921SPaul Mullowney 
4009ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
4019ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
4029ae82921SPaul Mullowney       AiUp[n]=nzUpper;
4039ae82921SPaul Mullowney       offset = nzUpper;
4049ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
4059ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
4069ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
4079ae82921SPaul Mullowney 
408e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
4099ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
4109ae82921SPaul Mullowney 
411e057df02SPaul Mullowney         /* decrement the offset */
4129ae82921SPaul Mullowney         offset -= (nz+1);
4139ae82921SPaul Mullowney 
414e057df02SPaul Mullowney         /* first, set the diagonal elements */
4159ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
41609f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1./v[nz];
4179ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
4189ae82921SPaul Mullowney 
419580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
420580bdb30SBarry Smith         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
4219ae82921SPaul Mullowney       }
4222205254eSKarl Rupp 
423aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
424aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
4252205254eSKarl Rupp 
426aa372e3fSPaul Mullowney       /* Create the matrix description */
42757d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
42857d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
42957d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
43057d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
43157d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
432aa372e3fSPaul Mullowney 
433aa372e3fSPaul Mullowney       /* Create the solve analysis information */
43457d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
435aa372e3fSPaul Mullowney 
436aa372e3fSPaul Mullowney       /* set the operation */
437aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
438aa372e3fSPaul Mullowney 
439aa372e3fSPaul Mullowney       /* set the matrix */
440aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
441aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
442aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
443aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
444aa372e3fSPaul Mullowney 
445aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
447aa372e3fSPaul Mullowney 
448aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
449aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
450aa372e3fSPaul Mullowney 
451aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
452aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
453aa372e3fSPaul Mullowney 
454aa372e3fSPaul Mullowney       /* perform the solve analysis */
455aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
456aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
457aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
45857d48284SJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
459aa372e3fSPaul Mullowney 
460aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
461aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
4622205254eSKarl Rupp 
46357d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
46457d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
46557d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
4664863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
4679ae82921SPaul Mullowney     } catch(char *ex) {
4689ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4699ae82921SPaul Mullowney     }
4709ae82921SPaul Mullowney   }
4719ae82921SPaul Mullowney   PetscFunctionReturn(0);
4729ae82921SPaul Mullowney }
4739ae82921SPaul Mullowney 
474087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
4759ae82921SPaul Mullowney {
4769ae82921SPaul Mullowney   PetscErrorCode               ierr;
4779ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
4789ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4799ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
4809ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
4819ae82921SPaul Mullowney   const PetscInt               *r,*c;
4829ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
4839ae82921SPaul Mullowney 
4849ae82921SPaul Mullowney   PetscFunctionBegin;
485ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
486087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
487087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
4882205254eSKarl Rupp 
489e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
490aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
4919ae82921SPaul Mullowney 
492c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
493e057df02SPaul Mullowney   /* lower triangular indices */
4949ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4959ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4962205254eSKarl Rupp   if (!row_identity) {
497aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
498aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
4992205254eSKarl Rupp   }
5009ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5019ae82921SPaul Mullowney 
502e057df02SPaul Mullowney   /* upper triangular indices */
5039ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
5049ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5052205254eSKarl Rupp   if (!col_identity) {
506aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
507aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
5082205254eSKarl Rupp   }
5094863603aSSatish Balay 
5104863603aSSatish Balay   if (!row_identity && !col_identity) {
5114863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
5124863603aSSatish Balay   } else if(!row_identity) {
5134863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
5144863603aSSatish Balay   } else if(!col_identity) {
5154863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
5164863603aSSatish Balay   }
5174863603aSSatish Balay 
5189ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
5199ae82921SPaul Mullowney   PetscFunctionReturn(0);
5209ae82921SPaul Mullowney }
5219ae82921SPaul Mullowney 
522087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
523087f3262SPaul Mullowney {
524087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
525087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
526aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
527aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
528087f3262SPaul Mullowney   cusparseStatus_t                  stat;
529087f3262SPaul Mullowney   PetscErrorCode                    ierr;
53057d48284SJunchao Zhang   cudaError_t                       cerr;
531087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
532087f3262SPaul Mullowney   PetscScalar                       *AAUp;
533087f3262SPaul Mullowney   PetscScalar                       *AALo;
534087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
535087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
536087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
537087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
538087f3262SPaul Mullowney 
539087f3262SPaul Mullowney   PetscFunctionBegin;
540cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
541c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
542087f3262SPaul Mullowney     try {
543087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
54457d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
54557d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
54657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
54757d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
548087f3262SPaul Mullowney 
549087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
550087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
551087f3262SPaul Mullowney       AiUp[n]=nzUpper;
552087f3262SPaul Mullowney       offset = 0;
553087f3262SPaul Mullowney       for (i=0; i<n; i++) {
554087f3262SPaul Mullowney         /* set the pointers */
555087f3262SPaul Mullowney         v  = aa + ai[i];
556087f3262SPaul Mullowney         vj = aj + ai[i];
557087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
558087f3262SPaul Mullowney 
559087f3262SPaul Mullowney         /* first, set the diagonal elements */
560087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
56109f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1.0/v[nz];
562087f3262SPaul Mullowney         AiUp[i]      = offset;
56309f51544SAlejandro Lamas Daviña         AALo[offset] = (MatScalar)1.0/v[nz];
564087f3262SPaul Mullowney 
565087f3262SPaul Mullowney         offset+=1;
566087f3262SPaul Mullowney         if (nz>0) {
567f22e0265SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
568580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
569087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
570087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
571087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
572087f3262SPaul Mullowney           }
573087f3262SPaul Mullowney           offset+=nz;
574087f3262SPaul Mullowney         }
575087f3262SPaul Mullowney       }
576087f3262SPaul Mullowney 
577aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
578aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
579087f3262SPaul Mullowney 
580aa372e3fSPaul Mullowney       /* Create the matrix description */
58157d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
58257d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
58357d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
58457d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
58557d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
586087f3262SPaul Mullowney 
587aa372e3fSPaul Mullowney       /* Create the solve analysis information */
58857d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
589aa372e3fSPaul Mullowney 
590aa372e3fSPaul Mullowney       /* set the operation */
591aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
592aa372e3fSPaul Mullowney 
593aa372e3fSPaul Mullowney       /* set the matrix */
594aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
595aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
596aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
597aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
598aa372e3fSPaul Mullowney 
599aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
600aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
601aa372e3fSPaul Mullowney 
602aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
603aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
604aa372e3fSPaul Mullowney 
605aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
606aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
607aa372e3fSPaul Mullowney 
608aa372e3fSPaul Mullowney       /* perform the solve analysis */
609aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
61257d48284SJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
613aa372e3fSPaul Mullowney 
614aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
615aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
616aa372e3fSPaul Mullowney 
617aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
618aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
619aa372e3fSPaul Mullowney 
620aa372e3fSPaul Mullowney       /* Create the matrix description */
62157d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
62257d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
62357d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
62457d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
62557d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
626aa372e3fSPaul Mullowney 
627aa372e3fSPaul Mullowney       /* Create the solve analysis information */
62857d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
629aa372e3fSPaul Mullowney 
630aa372e3fSPaul Mullowney       /* set the operation */
631aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
632aa372e3fSPaul Mullowney 
633aa372e3fSPaul Mullowney       /* set the matrix */
634aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
635aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
636aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
637aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
638aa372e3fSPaul Mullowney 
639aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
640aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
641aa372e3fSPaul Mullowney 
642aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
643aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
644aa372e3fSPaul Mullowney 
645aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
646aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
6474863603aSSatish Balay       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
648aa372e3fSPaul Mullowney 
649aa372e3fSPaul Mullowney       /* perform the solve analysis */
650aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
651aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
652aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
65357d48284SJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
654aa372e3fSPaul Mullowney 
655aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
656aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
657087f3262SPaul Mullowney 
658c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
65957d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
66057d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
66157d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
66257d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
663087f3262SPaul Mullowney     } catch(char *ex) {
664087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
665087f3262SPaul Mullowney     }
666087f3262SPaul Mullowney   }
667087f3262SPaul Mullowney   PetscFunctionReturn(0);
668087f3262SPaul Mullowney }
669087f3262SPaul Mullowney 
670087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
6719ae82921SPaul Mullowney {
6729ae82921SPaul Mullowney   PetscErrorCode               ierr;
673087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
674087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
675087f3262SPaul Mullowney   IS                           ip = a->row;
676087f3262SPaul Mullowney   const PetscInt               *rip;
677087f3262SPaul Mullowney   PetscBool                    perm_identity;
678087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
679087f3262SPaul Mullowney 
680087f3262SPaul Mullowney   PetscFunctionBegin;
681ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
682087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
683e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
684aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
685aa372e3fSPaul Mullowney 
686087f3262SPaul Mullowney   /* lower triangular indices */
687087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
688087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
689087f3262SPaul Mullowney   if (!perm_identity) {
6904e4bbfaaSStefano Zampini     IS             iip;
6914e4bbfaaSStefano Zampini     const PetscInt *irip;
6924e4bbfaaSStefano Zampini 
6934e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
6944e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
695aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
696aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
697aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
6984e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
6994e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
7004e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
7014863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
702087f3262SPaul Mullowney   }
703087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
704087f3262SPaul Mullowney   PetscFunctionReturn(0);
705087f3262SPaul Mullowney }
706087f3262SPaul Mullowney 
7076fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
7089ae82921SPaul Mullowney {
7099ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
7109ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
7119ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
712b175d8bbSPaul Mullowney   PetscErrorCode ierr;
7139ae82921SPaul Mullowney 
7149ae82921SPaul Mullowney   PetscFunctionBegin;
7159ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
716ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
717e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
7189ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
7199ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
720bda325fcSPaul Mullowney   if (row_identity && col_identity) {
721bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
722bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
7234e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7244e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
725bda325fcSPaul Mullowney   } else {
726bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
727bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
7284e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7294e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
730bda325fcSPaul Mullowney   }
7318dc1d2a3SPaul Mullowney 
732e057df02SPaul Mullowney   /* get the triangular factors */
733087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
7349ae82921SPaul Mullowney   PetscFunctionReturn(0);
7359ae82921SPaul Mullowney }
7369ae82921SPaul Mullowney 
737087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
738087f3262SPaul Mullowney {
739087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
740087f3262SPaul Mullowney   IS             ip = b->row;
741087f3262SPaul Mullowney   PetscBool      perm_identity;
742b175d8bbSPaul Mullowney   PetscErrorCode ierr;
743087f3262SPaul Mullowney 
744087f3262SPaul Mullowney   PetscFunctionBegin;
745087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
746ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
747087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
748087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
749087f3262SPaul Mullowney   if (perm_identity) {
750087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
751087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
7524e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7534e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
754087f3262SPaul Mullowney   } else {
755087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
756087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
7574e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7584e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
759087f3262SPaul Mullowney   }
760087f3262SPaul Mullowney 
761087f3262SPaul Mullowney   /* get the triangular factors */
762087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
763087f3262SPaul Mullowney   PetscFunctionReturn(0);
764087f3262SPaul Mullowney }
7659ae82921SPaul Mullowney 
766b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
767bda325fcSPaul Mullowney {
768bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
769aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
770aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
771aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
772aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
773bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
774aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
775aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
776aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
777aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
778b175d8bbSPaul Mullowney 
779bda325fcSPaul Mullowney   PetscFunctionBegin;
780bda325fcSPaul Mullowney 
781aa372e3fSPaul Mullowney   /*********************************************/
782aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
783aa372e3fSPaul Mullowney   /*********************************************/
784aa372e3fSPaul Mullowney 
785aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
786aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
787aa372e3fSPaul Mullowney 
788aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
789aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
790aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
791aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
792aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
793aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
794aa372e3fSPaul Mullowney 
795aa372e3fSPaul Mullowney   /* Create the matrix description */
79657d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
79757d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
79857d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
79957d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
80057d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
801aa372e3fSPaul Mullowney 
802aa372e3fSPaul Mullowney   /* Create the solve analysis information */
80357d48284SJunchao Zhang   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
804aa372e3fSPaul Mullowney 
805aa372e3fSPaul Mullowney   /* set the operation */
806aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
807aa372e3fSPaul Mullowney 
808aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
809aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
810aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
811aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
812aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
813aa372e3fSPaul Mullowney   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
814aa372e3fSPaul Mullowney   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
815aa372e3fSPaul Mullowney   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
816aa372e3fSPaul Mullowney 
817aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
818aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
819aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
820aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
821aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
822aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
823aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
824aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->column_indices->data().get(),
825aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->row_offsets->data().get(),
82657d48284SJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
827aa372e3fSPaul Mullowney 
828aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
829aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
830aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
831aa372e3fSPaul Mullowney                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
832aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
83357d48284SJunchao Zhang                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
834aa372e3fSPaul Mullowney 
835aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
836aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
837aa372e3fSPaul Mullowney 
838aa372e3fSPaul Mullowney   /*********************************************/
839aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
840aa372e3fSPaul Mullowney   /*********************************************/
841aa372e3fSPaul Mullowney 
842aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
843aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
846aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
847aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
848aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
849aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
850aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
851aa372e3fSPaul Mullowney 
852aa372e3fSPaul Mullowney   /* Create the matrix description */
85357d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
85457d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
85557d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
85657d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
85757d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
858aa372e3fSPaul Mullowney 
859aa372e3fSPaul Mullowney   /* Create the solve analysis information */
86057d48284SJunchao Zhang   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
861aa372e3fSPaul Mullowney 
862aa372e3fSPaul Mullowney   /* set the operation */
863aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
864aa372e3fSPaul Mullowney 
865aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
866aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
867aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
868aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
869aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
870aa372e3fSPaul Mullowney   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
871aa372e3fSPaul Mullowney   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
872aa372e3fSPaul Mullowney   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
873aa372e3fSPaul Mullowney 
874aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
875aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
876aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
877aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
878aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
879aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
880aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
881aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->column_indices->data().get(),
882aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->row_offsets->data().get(),
88357d48284SJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
884aa372e3fSPaul Mullowney 
885aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
886aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
887aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
888aa372e3fSPaul Mullowney                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
889aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
89057d48284SJunchao Zhang                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
891aa372e3fSPaul Mullowney 
892aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
893aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
894bda325fcSPaul Mullowney   PetscFunctionReturn(0);
895bda325fcSPaul Mullowney }
896bda325fcSPaul Mullowney 
897b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
898bda325fcSPaul Mullowney {
899aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
900aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
901aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
902bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
903bda325fcSPaul Mullowney   cusparseStatus_t             stat;
904aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
905b06137fdSPaul Mullowney   cudaError_t                  err;
906*85ba7357SStefano Zampini   PetscErrorCode               ierr;
907b175d8bbSPaul Mullowney 
908bda325fcSPaul Mullowney   PetscFunctionBegin;
909*85ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
910*85ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
911*85ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
912*85ba7357SStefano Zampini   /* create cusparse matrix */
913aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
91457d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
915aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
91657d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
91757d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
918aa372e3fSPaul Mullowney 
919b06137fdSPaul Mullowney   /* set alpha and beta */
920c41cb2e2SAlejandro Lamas Daviña   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
9217656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
9227656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
9237656d835SStefano Zampini   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
9247656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
9257656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
92657d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
927b06137fdSPaul Mullowney 
928aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
929aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
930aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
931554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
932554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
933aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
934a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
935aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
936aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
937a3fdcf43SKarl Rupp 
93881902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
93981902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
94081902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
941aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
942a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
943a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
944aa372e3fSPaul Mullowney                             matrix->values->data().get(),
94581902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
946aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
947aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
948aa372e3fSPaul Mullowney                             matrixT->column_indices->data().get(),
949aa372e3fSPaul Mullowney                             matrixT->row_offsets->data().get(),
95057d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
951aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
952aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
953aa372e3fSPaul Mullowney     CsrMatrix *temp = new CsrMatrix;
95451c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
95551c6d536SStefano Zampini 
95651c6d536SStefano Zampini     /* First convert HYB to CSR */
957aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
958aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
959aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
960aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
961aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
962aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
963aa372e3fSPaul Mullowney 
964aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
965aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
966aa372e3fSPaul Mullowney                             temp->values->data().get(),
967aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
96857d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
969aa372e3fSPaul Mullowney 
970aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
971aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
972aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
973aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
974aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
975aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
976aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
977aa372e3fSPaul Mullowney 
978aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
979aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
980aa372e3fSPaul Mullowney                             temp->values->data().get(),
981aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
982aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
983aa372e3fSPaul Mullowney                             tempT->values->data().get(),
984aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
985aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
98657d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
987aa372e3fSPaul Mullowney 
988aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
989aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
99057d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
991aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
992aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
993aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
994aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
995aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
996aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
99757d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
998aa372e3fSPaul Mullowney 
999aa372e3fSPaul Mullowney     /* assign the pointer */
1000aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1001aa372e3fSPaul Mullowney     /* delete temporaries */
1002aa372e3fSPaul Mullowney     if (tempT) {
1003aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1004aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1005aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1006aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1007087f3262SPaul Mullowney     }
1008aa372e3fSPaul Mullowney     if (temp) {
1009aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1010aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1011aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1012aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1013aa372e3fSPaul Mullowney     }
1014aa372e3fSPaul Mullowney   }
1015*85ba7357SStefano Zampini   err  = WaitForGPU();CHKERRCUDA(err);
1016*85ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1017*85ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1018213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1019213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1020aa372e3fSPaul Mullowney   /* assign the pointer */
1021aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1022bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1023bda325fcSPaul Mullowney }
1024bda325fcSPaul Mullowney 
10254e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
10266fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1027bda325fcSPaul Mullowney {
1028c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1029465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1030465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1031465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1032465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1033bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1034bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1035aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1036aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1037aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1038b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
103957d48284SJunchao Zhang   cudaError_t                           cerr;
1040bda325fcSPaul Mullowney 
1041bda325fcSPaul Mullowney   PetscFunctionBegin;
1042aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1043aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1044bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1045aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1046aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1047bda325fcSPaul Mullowney   }
1048bda325fcSPaul Mullowney 
1049bda325fcSPaul Mullowney   /* Get the GPU pointers */
1050c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1051c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1052c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1053c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1054bda325fcSPaul Mullowney 
10557a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1056aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1057c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1058c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1059c41cb2e2SAlejandro Lamas Daviña                xGPU);
1060aa372e3fSPaul Mullowney 
1061aa372e3fSPaul Mullowney   /* First, solve U */
1062aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
10637656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1064aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1065aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1066aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1067aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
106857d48284SJunchao Zhang                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1069aa372e3fSPaul Mullowney 
1070aa372e3fSPaul Mullowney   /* Then, solve L */
1071aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
10727656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1073aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1074aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1075aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1076aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
107757d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1078aa372e3fSPaul Mullowney 
1079aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1080c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1081c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1082aa372e3fSPaul Mullowney                tempGPU->begin());
1083aa372e3fSPaul Mullowney 
1084aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1085c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1086bda325fcSPaul Mullowney 
1087bda325fcSPaul Mullowney   /* restore */
1088c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1089c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
109057d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1091661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1092958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1093bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1094bda325fcSPaul Mullowney }
1095bda325fcSPaul Mullowney 
10966fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1097bda325fcSPaul Mullowney {
1098465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1099465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1100bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1101bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1102aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1103aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1104aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1105b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
110657d48284SJunchao Zhang   cudaError_t                       cerr;
1107bda325fcSPaul Mullowney 
1108bda325fcSPaul Mullowney   PetscFunctionBegin;
1109aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1110aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1111bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1112aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1113aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1114bda325fcSPaul Mullowney   }
1115bda325fcSPaul Mullowney 
1116bda325fcSPaul Mullowney   /* Get the GPU pointers */
1117c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1118c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1119bda325fcSPaul Mullowney 
11207a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1121aa372e3fSPaul Mullowney   /* First, solve U */
1122aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
11237656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1124aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1125aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1126aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1127aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
112857d48284SJunchao Zhang                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1129aa372e3fSPaul Mullowney 
1130aa372e3fSPaul Mullowney   /* Then, solve L */
1131aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
11327656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1133aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1134aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1135aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1136aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
113757d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1138bda325fcSPaul Mullowney 
1139bda325fcSPaul Mullowney   /* restore */
1140c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1141c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
114257d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1143661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1144958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1145bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1146bda325fcSPaul Mullowney }
1147bda325fcSPaul Mullowney 
11486fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
11499ae82921SPaul Mullowney {
1150465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1151465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1152465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1153465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
11549ae82921SPaul Mullowney   cusparseStatus_t                      stat;
11559ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1156aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1157aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1158aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1159b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
116057d48284SJunchao Zhang   cudaError_t                           cerr;
11619ae82921SPaul Mullowney 
11629ae82921SPaul Mullowney   PetscFunctionBegin;
1163ebc8f436SDominic Meiser 
1164e057df02SPaul Mullowney   /* Get the GPU pointers */
1165c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1166c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1167c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1168c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
11699ae82921SPaul Mullowney 
11707a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1171aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1172c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1173c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
11744e4bbfaaSStefano Zampini                tempGPU->begin());
1175aa372e3fSPaul Mullowney 
1176aa372e3fSPaul Mullowney   /* Next, solve L */
1177aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
11787656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1179aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1180aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1181aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1182aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
118357d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1184aa372e3fSPaul Mullowney 
1185aa372e3fSPaul Mullowney   /* Then, solve U */
1186aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
11877656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1188aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1189aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1190aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1191aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
119257d48284SJunchao Zhang                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1193aa372e3fSPaul Mullowney 
11944e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
11954e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
11964e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
11974e4bbfaaSStefano Zampini                xGPU);
11989ae82921SPaul Mullowney 
1199c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1200c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
120157d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1202661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1203958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12049ae82921SPaul Mullowney   PetscFunctionReturn(0);
12059ae82921SPaul Mullowney }
12069ae82921SPaul Mullowney 
12076fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
12089ae82921SPaul Mullowney {
1209465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1210465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
12119ae82921SPaul Mullowney   cusparseStatus_t                  stat;
12129ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1213aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1214aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1215aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1216b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
121757d48284SJunchao Zhang   cudaError_t                       cerr;
12189ae82921SPaul Mullowney 
12199ae82921SPaul Mullowney   PetscFunctionBegin;
1220e057df02SPaul Mullowney   /* Get the GPU pointers */
1221c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1222c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
12239ae82921SPaul Mullowney 
12247a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1225aa372e3fSPaul Mullowney   /* First, solve L */
1226aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
12277656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1228aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1229aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1230aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1231aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
123257d48284SJunchao Zhang                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1233aa372e3fSPaul Mullowney 
1234aa372e3fSPaul Mullowney   /* Next, solve U */
1235aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
12367656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1237aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1238aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1239aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1240aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
124157d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
12429ae82921SPaul Mullowney 
1243c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1244c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
124557d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1246661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1247958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12489ae82921SPaul Mullowney   PetscFunctionReturn(0);
12499ae82921SPaul Mullowney }
12509ae82921SPaul Mullowney 
12516fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
12529ae82921SPaul Mullowney {
1253aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
12547c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
12559ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1256213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
12579ae82921SPaul Mullowney   PetscErrorCode               ierr;
1258aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1259b06137fdSPaul Mullowney   cudaError_t                  err;
12609ae82921SPaul Mullowney 
12619ae82921SPaul Mullowney   PetscFunctionBegin;
126295639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1263c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
126481902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
126581902715SJunchao Zhang       /* Copy values only */
126681902715SJunchao Zhang       CsrMatrix *mat,*matT;
126781902715SJunchao Zhang       mat  = (CsrMatrix*)cusparsestruct->mat->mat;
1268*85ba7357SStefano Zampini 
1269*85ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1270*85ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
127181902715SJunchao Zhang       mat->values->assign(a->a, a->a+a->nz);
1272*85ba7357SStefano Zampini       err  = WaitForGPU();CHKERRCUDA(err);
1273*85ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
12744863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1275*85ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
127681902715SJunchao Zhang 
127781902715SJunchao Zhang       /* Update matT when it was built before */
127881902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
127981902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
128081902715SJunchao Zhang         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1281*85ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1282*85ba7357SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
128381902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
128481902715SJunchao Zhang                                  A->cmap->n, mat->num_entries,
128581902715SJunchao Zhang                                  mat->values->data().get(),
128681902715SJunchao Zhang                                  cusparsestruct->rowoffsets_gpu->data().get(),
128781902715SJunchao Zhang                                  mat->column_indices->data().get(),
128881902715SJunchao Zhang                                  matT->values->data().get(),
128981902715SJunchao Zhang                                  matT->column_indices->data().get(),
129081902715SJunchao Zhang                                  matT->row_offsets->data().get(),
12919f74ec24SSatish Balay                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
1292*85ba7357SStefano Zampini         err  = WaitForGPU();CHKERRCUDA(err);
1293*85ba7357SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1294*85ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
129581902715SJunchao Zhang       }
129634d6c7a5SJose E. Roman     } else {
1297*85ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1298*85ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
12997c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
13007c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
13017c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
130281902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
13039ae82921SPaul Mullowney       try {
13049ae82921SPaul Mullowney         if (a->compressedrow.use) {
13059ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
13069ae82921SPaul Mullowney           ii   = a->compressedrow.i;
13079ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
13089ae82921SPaul Mullowney         } else {
1309213423ffSJunchao Zhang           m    = A->rmap->n;
1310213423ffSJunchao Zhang           ii   = a->i;
1311e6e9a74fSStefano Zampini           ridx = NULL;
13129ae82921SPaul Mullowney         }
1313213423ffSJunchao Zhang         cusparsestruct->nrows = m;
13149ae82921SPaul Mullowney 
1315*85ba7357SStefano Zampini         /* create cusparse matrix */
1316aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
131757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
131857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
131957d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
13209ae82921SPaul Mullowney 
1321c41cb2e2SAlejandro Lamas Daviña         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
13227656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
13237656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
13247656d835SStefano Zampini         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
13257656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
13267656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
132757d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1328b06137fdSPaul Mullowney 
1329aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1330aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1331aa372e3fSPaul Mullowney           /* set the matrix */
1332aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1333a65300a6SPaul Mullowney           matrix->num_rows = m;
1334aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1335aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1336a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1337a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
13389ae82921SPaul Mullowney 
1339aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1340aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1341aa372e3fSPaul Mullowney 
1342aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1343aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1344aa372e3fSPaul Mullowney 
1345aa372e3fSPaul Mullowney           /* assign the pointer */
1346aa372e3fSPaul Mullowney           matstruct->mat = matrix;
1347aa372e3fSPaul Mullowney 
1348aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1349aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1350a65300a6SPaul Mullowney           matrix->num_rows = m;
1351aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1352aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1353a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1354a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
1355aa372e3fSPaul Mullowney 
1356aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1357aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1358aa372e3fSPaul Mullowney 
1359aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1360aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1361aa372e3fSPaul Mullowney 
1362aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
136357d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1364aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1365aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1366a65300a6SPaul Mullowney           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1367aa372e3fSPaul Mullowney               matstruct->descr, matrix->values->data().get(),
1368aa372e3fSPaul Mullowney               matrix->row_offsets->data().get(),
1369aa372e3fSPaul Mullowney               matrix->column_indices->data().get(),
137057d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1371aa372e3fSPaul Mullowney           /* assign the pointer */
1372aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1373aa372e3fSPaul Mullowney 
1374aa372e3fSPaul Mullowney           if (matrix) {
1375aa372e3fSPaul Mullowney             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1376aa372e3fSPaul Mullowney             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1377aa372e3fSPaul Mullowney             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1378aa372e3fSPaul Mullowney             delete (CsrMatrix*)matrix;
1379087f3262SPaul Mullowney           }
1380087f3262SPaul Mullowney         }
1381ca45077fSPaul Mullowney 
1382aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1383213423ffSJunchao Zhang         if (a->compressedrow.use) {
1384213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1385aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1386aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1387213423ffSJunchao Zhang           tmp = m;
1388213423ffSJunchao Zhang         } else {
1389213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1390213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1391213423ffSJunchao Zhang           tmp = 0;
1392213423ffSJunchao Zhang         }
1393213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1394aa372e3fSPaul Mullowney 
1395aa372e3fSPaul Mullowney         /* assign the pointer */
1396aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
13979ae82921SPaul Mullowney       } catch(char *ex) {
13989ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
13999ae82921SPaul Mullowney       }
1400*85ba7357SStefano Zampini       err  = WaitForGPU();CHKERRCUDA(err);
1401*85ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1402*85ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
140334d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
140434d6c7a5SJose E. Roman     }
1405c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
14069ae82921SPaul Mullowney   }
14079ae82921SPaul Mullowney   PetscFunctionReturn(0);
14089ae82921SPaul Mullowney }
14099ae82921SPaul Mullowney 
1410c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1411aa372e3fSPaul Mullowney {
1412aa372e3fSPaul Mullowney   template <typename Tuple>
1413aa372e3fSPaul Mullowney   __host__ __device__
1414aa372e3fSPaul Mullowney   void operator()(Tuple t)
1415aa372e3fSPaul Mullowney   {
1416aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1417aa372e3fSPaul Mullowney   }
1418aa372e3fSPaul Mullowney };
1419aa372e3fSPaul Mullowney 
1420e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1421e6e9a74fSStefano Zampini {
1422e6e9a74fSStefano Zampini   template <typename Tuple>
1423e6e9a74fSStefano Zampini   __host__ __device__
1424e6e9a74fSStefano Zampini   void operator()(Tuple t)
1425e6e9a74fSStefano Zampini   {
1426e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1427e6e9a74fSStefano Zampini   }
1428e6e9a74fSStefano Zampini };
1429e6e9a74fSStefano Zampini 
1430ccdfe979SStefano Zampini typedef struct {
1431ccdfe979SStefano Zampini   PetscBool   cisdense;
1432ccdfe979SStefano Zampini   PetscScalar *Bt;
1433ccdfe979SStefano Zampini   Mat         X;
1434ccdfe979SStefano Zampini } MatMatCusparse;
1435ccdfe979SStefano Zampini 
1436ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1437ccdfe979SStefano Zampini {
1438ccdfe979SStefano Zampini   PetscErrorCode ierr;
1439ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1440ccdfe979SStefano Zampini   cudaError_t    cerr;
1441ccdfe979SStefano Zampini 
1442ccdfe979SStefano Zampini   PetscFunctionBegin;
1443ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1444ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1445ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1446ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1447ccdfe979SStefano Zampini }
1448ccdfe979SStefano Zampini 
1449ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1450ccdfe979SStefano Zampini 
1451ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1452ccdfe979SStefano Zampini {
1453ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1454ccdfe979SStefano Zampini   Mat                          A,B;
1455ccdfe979SStefano Zampini   PetscInt                     m,n,k,blda,clda;
1456ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1457ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1458ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1459ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1460ccdfe979SStefano Zampini   cusparseMatDescr_t           matA;
1461ccdfe979SStefano Zampini   const PetscScalar            *barray;
1462ccdfe979SStefano Zampini   PetscScalar                  *carray;
1463ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1464ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1465ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1466ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1467c8378d12SStefano Zampini   cudaError_t                  cuer;
1468ccdfe979SStefano Zampini 
1469ccdfe979SStefano Zampini   PetscFunctionBegin;
1470ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1471ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1472ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1473ccdfe979SStefano Zampini   A    = product->A;
1474ccdfe979SStefano Zampini   B    = product->B;
1475ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1476ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1477ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1478ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1479ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1480ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1481ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1482ccdfe979SStefano Zampini   switch (product->type) {
1483ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1484ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1485ccdfe979SStefano Zampini     mat = cusp->mat;
1486ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1487ccdfe979SStefano Zampini     m   = A->rmap->n;
1488ccdfe979SStefano Zampini     k   = A->cmap->n;
1489ccdfe979SStefano Zampini     n   = B->cmap->n;
1490ccdfe979SStefano Zampini     break;
1491ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1492e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1493e6e9a74fSStefano Zampini       mat = cusp->mat;
1494e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1495e6e9a74fSStefano Zampini     } else {
1496ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1497ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1498ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1499e6e9a74fSStefano Zampini     }
1500ccdfe979SStefano Zampini     m = A->cmap->n;
1501ccdfe979SStefano Zampini     k = A->rmap->n;
1502ccdfe979SStefano Zampini     n = B->cmap->n;
1503ccdfe979SStefano Zampini     break;
1504ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1505ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1506ccdfe979SStefano Zampini     mat = cusp->mat;
1507ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1508ccdfe979SStefano Zampini     m   = A->rmap->n;
1509ccdfe979SStefano Zampini     k   = B->cmap->n;
1510ccdfe979SStefano Zampini     n   = B->rmap->n;
1511ccdfe979SStefano Zampini     break;
1512ccdfe979SStefano Zampini   default:
1513ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1514ccdfe979SStefano Zampini   }
1515ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1516ccdfe979SStefano Zampini   matA   = mat->descr;
1517ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1518ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1519ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1520ccdfe979SStefano Zampini   if (!biscuda) {
1521ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1522ccdfe979SStefano Zampini   }
1523ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1524ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1525c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1526c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1527c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1528c8378d12SStefano Zampini   } else {
1529c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1530c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1531c8378d12SStefano Zampini   }
1532c8378d12SStefano Zampini 
1533c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1534c8378d12SStefano Zampini 
1535ccdfe979SStefano Zampini   /* cusparse spmm does not support transpose on B */
1536ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1537ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
1538ccdfe979SStefano Zampini     cublasStatus_t cerr;
1539ccdfe979SStefano Zampini 
1540ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1541ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1542ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
1543ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
1544ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
1545ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1546ccdfe979SStefano Zampini     blda = B->cmap->n;
1547ccdfe979SStefano Zampini   }
1548ccdfe979SStefano Zampini 
1549ccdfe979SStefano Zampini   /* perform the MatMat operation */
1550ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1551ccdfe979SStefano Zampini                            csrmat->num_entries,mat->alpha,matA,
1552ccdfe979SStefano Zampini                            csrmat->values->data().get(),
1553ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
1554ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
1555ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1556ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
1557c8378d12SStefano Zampini   cuer = WaitForGPU();CHKERRCUDA(cuer);
1558c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1559c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1560ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1561ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
1562ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1563ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1564ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
1565ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1566ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1567ccdfe979SStefano Zampini   } else {
1568ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1569ccdfe979SStefano Zampini   }
1570ccdfe979SStefano Zampini   if (mmdata->cisdense) {
1571ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1572ccdfe979SStefano Zampini   }
1573ccdfe979SStefano Zampini   if (!biscuda) {
1574ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1575ccdfe979SStefano Zampini   }
1576ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1577ccdfe979SStefano Zampini }
1578ccdfe979SStefano Zampini 
1579ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1580ccdfe979SStefano Zampini {
1581ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
1582ccdfe979SStefano Zampini   Mat                A,B;
1583ccdfe979SStefano Zampini   PetscInt           m,n;
1584ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
1585ccdfe979SStefano Zampini   PetscErrorCode     ierr;
1586ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
1587ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
1588ccdfe979SStefano Zampini 
1589ccdfe979SStefano Zampini   PetscFunctionBegin;
1590ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1591ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1592ccdfe979SStefano Zampini   A    = product->A;
1593ccdfe979SStefano Zampini   B    = product->B;
1594ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1595ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1596ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1597ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1598ccdfe979SStefano Zampini   switch (product->type) {
1599ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1600ccdfe979SStefano Zampini     m = A->rmap->n;
1601ccdfe979SStefano Zampini     n = B->cmap->n;
1602ccdfe979SStefano Zampini     break;
1603ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1604ccdfe979SStefano Zampini     m = A->cmap->n;
1605ccdfe979SStefano Zampini     n = B->cmap->n;
1606ccdfe979SStefano Zampini     break;
1607ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1608ccdfe979SStefano Zampini     m = A->rmap->n;
1609ccdfe979SStefano Zampini     n = B->rmap->n;
1610ccdfe979SStefano Zampini     break;
1611ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1612ccdfe979SStefano Zampini     m = B->cmap->n;
1613ccdfe979SStefano Zampini     n = B->cmap->n;
1614ccdfe979SStefano Zampini     break;
1615ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1616ccdfe979SStefano Zampini     m = B->rmap->n;
1617ccdfe979SStefano Zampini     n = B->rmap->n;
1618ccdfe979SStefano Zampini     break;
1619ccdfe979SStefano Zampini   default:
1620ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1621ccdfe979SStefano Zampini   }
1622ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1623ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1624ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1625ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1626ccdfe979SStefano Zampini 
1627ccdfe979SStefano Zampini   /* product data */
1628ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1629ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
1630ccdfe979SStefano Zampini   /* cusparse spmm does not support transpose on B */
1631ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1632ccdfe979SStefano Zampini     cudaError_t cerr;
1633ccdfe979SStefano Zampini 
1634ccdfe979SStefano Zampini     cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1635ccdfe979SStefano Zampini   }
1636ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
1637ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1638ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1639ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1640ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1641ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1642ccdfe979SStefano Zampini     } else {
1643ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1644ccdfe979SStefano Zampini     }
1645ccdfe979SStefano Zampini   }
1646ccdfe979SStefano Zampini   C->product->data    = mmdata;
1647ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
1648ccdfe979SStefano Zampini 
1649ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1650ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1651ccdfe979SStefano Zampini }
1652ccdfe979SStefano Zampini 
1653ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1654ccdfe979SStefano Zampini 
1655ccdfe979SStefano Zampini /* handles dense B */
1656ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
1657ccdfe979SStefano Zampini {
1658ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
1659ccdfe979SStefano Zampini   PetscErrorCode ierr;
1660ccdfe979SStefano Zampini 
1661ccdfe979SStefano Zampini   PetscFunctionBegin;
1662ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1663ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1664ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
1665ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
1666ccdfe979SStefano Zampini     PetscFunctionReturn(0);
1667ccdfe979SStefano Zampini   }
1668ccdfe979SStefano Zampini   switch (product->type) {
1669ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1670ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1671ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1672ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1673ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1674ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
1675ccdfe979SStefano Zampini   default:
1676ccdfe979SStefano Zampini     break;
1677ccdfe979SStefano Zampini   }
1678ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1679ccdfe979SStefano Zampini }
1680ccdfe979SStefano Zampini 
16816fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
16829ae82921SPaul Mullowney {
1683b175d8bbSPaul Mullowney   PetscErrorCode ierr;
16849ae82921SPaul Mullowney 
16859ae82921SPaul Mullowney   PetscFunctionBegin;
1686e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1687e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1688e6e9a74fSStefano Zampini }
1689e6e9a74fSStefano Zampini 
1690e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
1691e6e9a74fSStefano Zampini {
1692e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1693e6e9a74fSStefano Zampini 
1694e6e9a74fSStefano Zampini   PetscFunctionBegin;
1695e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1696e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1697e6e9a74fSStefano Zampini }
1698e6e9a74fSStefano Zampini 
1699e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1700e6e9a74fSStefano Zampini {
1701e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1702e6e9a74fSStefano Zampini 
1703e6e9a74fSStefano Zampini   PetscFunctionBegin;
1704e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
1705e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1706e6e9a74fSStefano Zampini }
1707e6e9a74fSStefano Zampini 
1708e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1709e6e9a74fSStefano Zampini {
1710e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1711e6e9a74fSStefano Zampini 
1712e6e9a74fSStefano Zampini   PetscFunctionBegin;
1713e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
17149ae82921SPaul Mullowney   PetscFunctionReturn(0);
17159ae82921SPaul Mullowney }
17169ae82921SPaul Mullowney 
17176fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1718ca45077fSPaul Mullowney {
1719b175d8bbSPaul Mullowney   PetscErrorCode ierr;
1720ca45077fSPaul Mullowney 
1721ca45077fSPaul Mullowney   PetscFunctionBegin;
1722e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1723ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1724ca45077fSPaul Mullowney }
1725ca45077fSPaul Mullowney 
1726e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
17279ae82921SPaul Mullowney {
17289ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1729aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
17309ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1731e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
1732b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
173357d48284SJunchao Zhang   cudaError_t                  cerr;
1734aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1735e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1736e6e9a74fSStefano Zampini   PetscBool                    compressed;
17376e111a19SKarl Rupp 
17389ae82921SPaul Mullowney   PetscFunctionBegin;
1739e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
1740e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
1741e6e9a74fSStefano Zampini     if (!yy) {
1742e6e9a74fSStefano Zampini       ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
1743e6e9a74fSStefano Zampini     }
1744e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
1745e6e9a74fSStefano Zampini   }
174634d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
174734d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1748e6e9a74fSStefano Zampini   if (!trans) {
17499ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1750e6e9a74fSStefano Zampini   } else {
1751e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
1752e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
1753e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1754e6e9a74fSStefano Zampini     } else {
1755e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1756e6e9a74fSStefano Zampini       if (!matstruct) {
1757e6e9a74fSStefano Zampini         ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1758e6e9a74fSStefano Zampini         matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1759e6e9a74fSStefano Zampini       }
1760e6e9a74fSStefano Zampini     }
1761e6e9a74fSStefano Zampini   }
1762e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
1763e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
1764213423ffSJunchao Zhang 
1765e6e9a74fSStefano Zampini   try {
1766e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
1767213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
1768213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
1769*85ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1770e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1771e6e9a74fSStefano Zampini       xptr = xarray;
1772e6e9a74fSStefano Zampini       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
1773213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
1774e6e9a74fSStefano Zampini     } else {
1775e6e9a74fSStefano Zampini       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
1776e6e9a74fSStefano Zampini       dptr = zarray;
1777e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
1778e6e9a74fSStefano Zampini 
1779e6e9a74fSStefano Zampini       if (compressed) {
1780e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
1781e6e9a74fSStefano Zampini 
1782e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
1783e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1784e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
1785e6e9a74fSStefano Zampini       }
1786e6e9a74fSStefano Zampini     }
17879ae82921SPaul Mullowney 
1788213423ffSJunchao Zhang     /* csr_spmv does y = alpha*Ax + beta*y */
1789aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
17907656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1791e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
1792a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1793b06137fdSPaul Mullowney                                mat->num_entries, matstruct->alpha, matstruct->descr,
1794aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1795e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
179657d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
1797aa372e3fSPaul Mullowney     } else {
1798213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
1799301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1800e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
1801b06137fdSPaul Mullowney                                  matstruct->alpha, matstruct->descr, hybMat,
1802e6e9a74fSStefano Zampini                                  xptr, beta,
180357d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
1804a65300a6SPaul Mullowney       }
1805aa372e3fSPaul Mullowney     }
1806e6e9a74fSStefano Zampini     cerr = WaitForGPU();CHKERRCUDA(cerr);
1807958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1808aa372e3fSPaul Mullowney 
1809e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1810213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
1811213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
1812213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
1813e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
1814213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
18157656d835SStefano Zampini         }
1816213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
1817c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
18187656d835SStefano Zampini       }
18197656d835SStefano Zampini 
1820213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
1821213423ffSJunchao Zhang       if (compressed) {
1822213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
1823e6e9a74fSStefano Zampini 
1824e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1825c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1826e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1827c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
1828e6e9a74fSStefano Zampini         cerr = WaitForGPU();CHKERRCUDA(cerr);
1829958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1830e6e9a74fSStefano Zampini       }
1831e6e9a74fSStefano Zampini     } else {
1832e6e9a74fSStefano Zampini       if (yy && yy != zz) {
1833e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
1834e6e9a74fSStefano Zampini       }
1835e6e9a74fSStefano Zampini     }
1836e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
1837213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
1838213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
18399ae82921SPaul Mullowney   } catch(char *ex) {
18409ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18419ae82921SPaul Mullowney   }
1842e6e9a74fSStefano Zampini   if (yy) {
1843958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1844e6e9a74fSStefano Zampini   } else {
1845e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
1846e6e9a74fSStefano Zampini   }
18479ae82921SPaul Mullowney   PetscFunctionReturn(0);
18489ae82921SPaul Mullowney }
18499ae82921SPaul Mullowney 
18506fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1851ca45077fSPaul Mullowney {
1852b175d8bbSPaul Mullowney   PetscErrorCode ierr;
18536e111a19SKarl Rupp 
1854ca45077fSPaul Mullowney   PetscFunctionBegin;
1855e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1856ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1857ca45077fSPaul Mullowney }
1858ca45077fSPaul Mullowney 
18596fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
18609ae82921SPaul Mullowney {
18619ae82921SPaul Mullowney   PetscErrorCode ierr;
18626e111a19SKarl Rupp 
18639ae82921SPaul Mullowney   PetscFunctionBegin;
18649ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
186595639643SRichard Tran Mills   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
1866bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
1867e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1868bc3f50f2SPaul Mullowney   }
18699ae82921SPaul Mullowney   PetscFunctionReturn(0);
18709ae82921SPaul Mullowney }
18719ae82921SPaul Mullowney 
18729ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
1873e057df02SPaul Mullowney /*@
18749ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1875e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
1876e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1877e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1878e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1879e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
18809ae82921SPaul Mullowney 
1881d083f849SBarry Smith    Collective
18829ae82921SPaul Mullowney 
18839ae82921SPaul Mullowney    Input Parameters:
18849ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
18859ae82921SPaul Mullowney .  m - number of rows
18869ae82921SPaul Mullowney .  n - number of columns
18879ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
18889ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
18890298fd71SBarry Smith          (possibly different for each row) or NULL
18909ae82921SPaul Mullowney 
18919ae82921SPaul Mullowney    Output Parameter:
18929ae82921SPaul Mullowney .  A - the matrix
18939ae82921SPaul Mullowney 
18949ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
18959ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
18969ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
18979ae82921SPaul Mullowney 
18989ae82921SPaul Mullowney    Notes:
18999ae82921SPaul Mullowney    If nnz is given then nz is ignored
19009ae82921SPaul Mullowney 
19019ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
19029ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
19039ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
19049ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
19059ae82921SPaul Mullowney 
19069ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
19070298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
19089ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
19099ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
19109ae82921SPaul Mullowney 
19119ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
19129ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
19139ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
19149ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
19159ae82921SPaul Mullowney 
19169ae82921SPaul Mullowney    Level: intermediate
19179ae82921SPaul Mullowney 
1918e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
19199ae82921SPaul Mullowney @*/
19209ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
19219ae82921SPaul Mullowney {
19229ae82921SPaul Mullowney   PetscErrorCode ierr;
19239ae82921SPaul Mullowney 
19249ae82921SPaul Mullowney   PetscFunctionBegin;
19259ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
19269ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
19279ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
19289ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
19299ae82921SPaul Mullowney   PetscFunctionReturn(0);
19309ae82921SPaul Mullowney }
19319ae82921SPaul Mullowney 
19326fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
19339ae82921SPaul Mullowney {
19349ae82921SPaul Mullowney   PetscErrorCode ierr;
1935ab25e6cbSDominic Meiser 
19369ae82921SPaul Mullowney   PetscFunctionBegin;
19379ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
1938470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
19399ae82921SPaul Mullowney   } else {
1940470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1941aa372e3fSPaul Mullowney   }
1942ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
1943ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
1944ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
1945ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
19469ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
19479ae82921SPaul Mullowney   PetscFunctionReturn(0);
19489ae82921SPaul Mullowney }
19499ae82921SPaul Mullowney 
1950ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
195195639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
19529ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
19539ff858a8SKarl Rupp {
19549ff858a8SKarl Rupp   PetscErrorCode ierr;
19559ff858a8SKarl Rupp 
19569ff858a8SKarl Rupp   PetscFunctionBegin;
19579ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1958ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
19599ff858a8SKarl Rupp   PetscFunctionReturn(0);
19609ff858a8SKarl Rupp }
19619ff858a8SKarl Rupp 
196295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
196395639643SRichard Tran Mills {
1964e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1965e6e9a74fSStefano Zampini 
196695639643SRichard Tran Mills   PetscFunctionBegin;
1967e6e9a74fSStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
1968c34f1ff0SRichard Tran Mills   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
196995639643SRichard Tran Mills      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1970e6e9a74fSStefano Zampini      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
197195639643SRichard Tran Mills      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
197295639643SRichard Tran Mills            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1973e6e9a74fSStefano Zampini   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
1974e6e9a74fSStefano Zampini   /* TODO: add support for this? */
197595639643SRichard Tran Mills   if (flg) {
197695639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJ;
197795639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJ;
1978c34f1ff0SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
1979c34f1ff0SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
1980e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = NULL;
1981e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = NULL;
1982e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
1983e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
198495639643SRichard Tran Mills   } else {
198595639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
198695639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
198795639643SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
198895639643SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
1989e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
1990e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
1991e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
1992e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
199395639643SRichard Tran Mills   }
199495639643SRichard Tran Mills   A->boundtocpu = flg;
199595639643SRichard Tran Mills   PetscFunctionReturn(0);
199695639643SRichard Tran Mills }
199795639643SRichard Tran Mills 
199849735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
19999ae82921SPaul Mullowney {
20009ae82921SPaul Mullowney   PetscErrorCode   ierr;
2001aa372e3fSPaul Mullowney   cusparseStatus_t stat;
200249735bf3SStefano Zampini   Mat              B;
20039ae82921SPaul Mullowney 
20049ae82921SPaul Mullowney   PetscFunctionBegin;
200549735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
200649735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
200749735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
200849735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
200949735bf3SStefano Zampini   }
201049735bf3SStefano Zampini   B = *newmat;
201149735bf3SStefano Zampini 
201234136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
201334136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
201434136279SStefano Zampini 
201549735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
20169ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
2017e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
2018e6e9a74fSStefano Zampini 
2019e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2020e6e9a74fSStefano Zampini       spptr->format = MAT_CUSPARSE_CSR;
2021e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2022e6e9a74fSStefano Zampini       B->spptr = spptr;
20239ae82921SPaul Mullowney     } else {
2024e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
2025e6e9a74fSStefano Zampini 
2026e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2027e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2028e6e9a74fSStefano Zampini       B->spptr = spptr;
20299ae82921SPaul Mullowney     }
2030e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
203149735bf3SStefano Zampini   }
2032693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
20339ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
20349ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
203595639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2036693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
20372205254eSKarl Rupp 
2038e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
20399ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2040bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
20419ae82921SPaul Mullowney   PetscFunctionReturn(0);
20429ae82921SPaul Mullowney }
20439ae82921SPaul Mullowney 
204402fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
204502fe1965SBarry Smith {
204602fe1965SBarry Smith   PetscErrorCode ierr;
204702fe1965SBarry Smith 
204802fe1965SBarry Smith   PetscFunctionBegin;
204902fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
20500ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
205102fe1965SBarry Smith   PetscFunctionReturn(0);
205202fe1965SBarry Smith }
205302fe1965SBarry Smith 
20543ca39a21SBarry Smith /*MC
2055e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2056e057df02SPaul Mullowney 
2057e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
20582692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
20592692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2060e057df02SPaul Mullowney 
2061e057df02SPaul Mullowney    Options Database Keys:
2062e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2063aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2064a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2065e057df02SPaul Mullowney 
2066e057df02SPaul Mullowney   Level: beginner
2067e057df02SPaul Mullowney 
20688468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2069e057df02SPaul Mullowney M*/
20707f756511SDominic Meiser 
207142c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
207242c9c57cSBarry Smith 
20730f39cd5aSBarry Smith 
20743ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
207542c9c57cSBarry Smith {
207642c9c57cSBarry Smith   PetscErrorCode ierr;
207742c9c57cSBarry Smith 
207842c9c57cSBarry Smith   PetscFunctionBegin;
20793ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20803ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20813ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20823ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
208342c9c57cSBarry Smith   PetscFunctionReturn(0);
208442c9c57cSBarry Smith }
208529b38603SBarry Smith 
2086470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
20877f756511SDominic Meiser {
2088e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
20897f756511SDominic Meiser   cusparseStatus_t stat;
20907f756511SDominic Meiser   cusparseHandle_t handle;
20917f756511SDominic Meiser 
20927f756511SDominic Meiser   PetscFunctionBegin;
20937f756511SDominic Meiser   if (*cusparsestruct) {
2094e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
2095e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
20967f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
209781902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
20987f756511SDominic Meiser     if (handle = (*cusparsestruct)->handle) {
209957d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
21007f756511SDominic Meiser     }
2101e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
21027f756511SDominic Meiser   }
21037f756511SDominic Meiser   PetscFunctionReturn(0);
21047f756511SDominic Meiser }
21057f756511SDominic Meiser 
21067f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
21077f756511SDominic Meiser {
21087f756511SDominic Meiser   PetscFunctionBegin;
21097f756511SDominic Meiser   if (*mat) {
21107f756511SDominic Meiser     delete (*mat)->values;
21117f756511SDominic Meiser     delete (*mat)->column_indices;
21127f756511SDominic Meiser     delete (*mat)->row_offsets;
21137f756511SDominic Meiser     delete *mat;
21147f756511SDominic Meiser     *mat = 0;
21157f756511SDominic Meiser   }
21167f756511SDominic Meiser   PetscFunctionReturn(0);
21177f756511SDominic Meiser }
21187f756511SDominic Meiser 
2119470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
21207f756511SDominic Meiser {
21217f756511SDominic Meiser   cusparseStatus_t stat;
21227f756511SDominic Meiser   PetscErrorCode   ierr;
21237f756511SDominic Meiser 
21247f756511SDominic Meiser   PetscFunctionBegin;
21257f756511SDominic Meiser   if (*trifactor) {
212657d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
212757d48284SJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
21287f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
21297f756511SDominic Meiser     delete *trifactor;
21307f756511SDominic Meiser     *trifactor = 0;
21317f756511SDominic Meiser   }
21327f756511SDominic Meiser   PetscFunctionReturn(0);
21337f756511SDominic Meiser }
21347f756511SDominic Meiser 
2135470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
21367f756511SDominic Meiser {
21377f756511SDominic Meiser   CsrMatrix        *mat;
21387f756511SDominic Meiser   cusparseStatus_t stat;
21397f756511SDominic Meiser   cudaError_t      err;
21407f756511SDominic Meiser 
21417f756511SDominic Meiser   PetscFunctionBegin;
21427f756511SDominic Meiser   if (*matstruct) {
21437f756511SDominic Meiser     if ((*matstruct)->mat) {
21447f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
21457f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
214657d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
21477f756511SDominic Meiser       } else {
21487f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
21497f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
21507f756511SDominic Meiser       }
21517f756511SDominic Meiser     }
215257d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
21537f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
2154c41cb2e2SAlejandro Lamas Daviña     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
21557656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
21567656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
21577f756511SDominic Meiser     delete *matstruct;
21587f756511SDominic Meiser     *matstruct = 0;
21597f756511SDominic Meiser   }
21607f756511SDominic Meiser   PetscFunctionReturn(0);
21617f756511SDominic Meiser }
21627f756511SDominic Meiser 
2163ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
21647f756511SDominic Meiser {
2165e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2166e6e9a74fSStefano Zampini 
21677f756511SDominic Meiser   PetscFunctionBegin;
21687f756511SDominic Meiser   if (*trifactors) {
2169e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
2170e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
2171e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
2172e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
21737f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
21747f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
21757f756511SDominic Meiser     delete (*trifactors)->workVector;
2176ccdfe979SStefano Zampini     (*trifactors)->rpermIndices = 0;
2177ccdfe979SStefano Zampini     (*trifactors)->cpermIndices = 0;
2178ccdfe979SStefano Zampini     (*trifactors)->workVector = 0;
2179ccdfe979SStefano Zampini   }
2180ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2181ccdfe979SStefano Zampini }
2182ccdfe979SStefano Zampini 
2183ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2184ccdfe979SStefano Zampini {
2185e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
2186ccdfe979SStefano Zampini   cusparseHandle_t handle;
2187ccdfe979SStefano Zampini   cusparseStatus_t stat;
2188ccdfe979SStefano Zampini 
2189ccdfe979SStefano Zampini   PetscFunctionBegin;
2190ccdfe979SStefano Zampini   if (*trifactors) {
2191e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
21927f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
219357d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
21947f756511SDominic Meiser     }
2195e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
21967f756511SDominic Meiser   }
21977f756511SDominic Meiser   PetscFunctionReturn(0);
21987f756511SDominic Meiser }
2199