xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 95a2cb335deee435f0b06953e4461b2237b5f64e)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
74 
75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
81 
82 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
83 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
84 
/*
   MatCUSPARSESetStream - Records a user-provided CUDA stream on the matrix's
   GPU data structure and binds it to the matrix's cuSPARSE handle, so later
   cuSPARSE calls made through that handle execute on this stream.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to attach
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
96 
/*
   MatCUSPARSESetHandle - Installs a cuSPARSE handle on the matrix, destroying
   any previously owned handle that differs from the new one, and switches the
   handle to device pointer mode.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the cuSPARSE handle the matrix should use from now on
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently own before adopting the caller's */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  /* scalar arguments (alpha/beta) will live in device memory */
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
113 
/*
   MatCUSPARSEClearHandle - Forgets the cuSPARSE handle stored on the matrix
   without destroying it; a no-op when A is not a MATSEQAIJCUSPARSE or has no
   GPU data structure.

   Input Parameter:
.  A - the matrix

   Note: the handle is only zeroed, not destroyed — presumably ownership has
   been transferred elsewhere (e.g. via MatCUSPARSESetHandle); verify at call site.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
126 
/* Reports the solver package (MATSOLVERCUSPARSE) that produced this factor. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
133 
134 /*MC
135   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
136   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
137 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
138   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
139   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
140   algorithms are not recommended. This class does NOT support direct solver operations.
141 
142   Level: beginner
143 
144 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
145 M*/
146 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates an (empty) square factor matrix
   of type MATSEQAIJCUSPARSE and wires up the symbolic-factorization function
   pointers appropriate for the requested factor type.

   Input Parameters:
+  A     - the matrix to be factored (only its row layout and block sizes are used here)
-  ftype - LU/ILU/ILUDT or Cholesky/ICC

   Output Parameter:
.  B - the factor matrix shell
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* actual storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
/* Type-specific implementation of MatCUSPARSESetFormat(); for a sequential
   matrix MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single storage
   format used for SpMV (and TriSolve). */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:  /* fall through: both ops touch the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
190 
191 /*@
192    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
193    operation. Only the MatMult operation can use different GPU storage formats
194    for MPIAIJCUSPARSE matrices.
195    Not Collective
196 
197    Input Parameters:
198 +  A - Matrix of type SEQAIJCUSPARSE
199 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
200 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
201 
202    Output Parameter:
203 
204    Level: intermediate
205 
206 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
207 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific method if the matrix type provides one;
     silently a no-op otherwise (PetscTryMethod semantics) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
217 
218 /*@
219    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
220 
221    Collective on mat
222 
223    Input Parameters:
224 +  A - Matrix of type SEQAIJCUSPARSE
225 -  transgen - the boolean flag
226 
227    Level: intermediate
228 
229 .seealso: MATSEQAIJCUSPARSE
230 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode     ierr;
  PetscBool          iscusp;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp) PetscFunctionReturn(0); /* silently ignore other matrix types */
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp           = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusp->transgen = transgen;
  if (!transgen) {
    /* drop any cached transpose so that re-enabling transgen later cannot
       operate on stale data */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
250 
/*
   Processes the -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix:
   explicit-transpose generation, GPU storage format(s), and — for CUDA >= 11 —
   the cuSPARSE SpMV/SpMM/csr2csc algorithm selections.  For factored matrices
   (factortype != MAT_FACTOR_NONE) no options are parsed.
*/
251 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
252 {
253   PetscErrorCode           ierr;
254   MatCUSPARSEStorageFormat format;
255   PetscBool                flg;
256   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
257 
258   PetscFunctionBegin;
259   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
260   if (A->factortype == MAT_FACTOR_NONE) {
261     PetscBool transgen = cusparsestruct->transgen;
262 
263     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
264     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
265 
      /* -mat_cusparse_mult_storage_format affects SpMV only; the broader
         -mat_cusparse_storage_format below overrides it for all operations */
266     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
267                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
268     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
269 
270     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
271                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
272     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
273    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
274     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
275     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
276                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
277     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
278     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
279 
280     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
281     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
282                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
283     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
284 
      /* enum value 0 in MatCUSPARSECsr2CscAlgorithms[] is a PETSc-invented
         "INVALID" placeholder because cuSPARSE starts this enum at 1 */
285     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
286     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
287                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
288     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
289    #endif
290   }
291   ierr = PetscOptionsTail();CHKERRQ(ierr);
292   PetscFunctionReturn(0);
293 }
294 
/* ILU(k) symbolic phase: reset any stale GPU triangular-factor data, run the
   CPU AIJ symbolic factorization, then install the CUSPARSE numeric kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
306 
/* LU symbolic phase: reset any stale GPU triangular-factor data, run the
   CPU AIJ symbolic factorization, then install the CUSPARSE numeric kernel. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
318 
/* ICC(k) symbolic phase: reset any stale GPU triangular-factor data, run the
   CPU AIJ symbolic factorization, then install the CUSPARSE numeric kernel. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
330 
/* Cholesky symbolic phase: reset any stale GPU triangular-factor data, run the
   CPU AIJ symbolic factorization, then install the CUSPARSE numeric kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
342 
/*
   Builds (first call) or value-refreshes (later calls) the GPU copy of the unit
   lower triangular factor L from the host ILU factorization stored in A.  The
   host arrays (a->i, a->j, a->a) are repacked in pinned host memory into a CSR
   matrix with explicit 1.0 entries on the diagonal (CUSPARSE_DIAG_TYPE_UNIT),
   uploaded through thrust arrays, and the cuSPARSE triangular-solve analysis
   is run once when the structure is first created.  Only acts when the fresh
   values live on the CPU (offloadmask UNALLOCATED or CPU).
   NOTE(review): assumes each row of the factored Mat_SeqAIJ stores its
   strictly-lower entries first, as produced by MatILUFactorSymbolic_SeqAIJ —
   confirm against the CPU factorization code.
*/
343 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
344 {
345   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
346   PetscInt                          n = A->rmap->n;
347   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
348   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
349   cusparseStatus_t                  stat;
350   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
351   const MatScalar                   *aa = a->a,*v;
352   PetscInt                          *AiLo, *AjLo;
353   PetscScalar                       *AALo;
354   PetscInt                          i,nz, nzLower, offset, rowOffset;
355   PetscErrorCode                    ierr;
356   cudaError_t                       cerr;
357 
358   PetscFunctionBegin;
359   if (!n) PetscFunctionReturn(0);
360   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
361     try {
362       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
363       nzLower=n+ai[n]-ai[1];
      /* pinned host buffer for the values; needed by both the build and the
         update-only paths below */
364       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
      /* first call: build the full CSR structure (pattern + values);
         subsequent calls only refresh the numerical values */
365       if (!loTriFactor) {
366 
367         /* Allocate Space for the lower triangular matrix */
368         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
369         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
370 
371         /* Fill the lower triangular matrix */
372         AiLo[0]  = (PetscInt) 0;
373         AiLo[n]  = nzLower;
374         AjLo[0]  = (PetscInt) 0;
375         AALo[0]  = (MatScalar) 1.0;
376         v        = aa;
377         vi       = aj;
378         offset   = 1;
379         rowOffset= 1;
380         for (i=1; i<n; i++) {
381           nz = ai[i+1] - ai[i];
382           /* additional 1 for the term on the diagonal */
383           AiLo[i]    = rowOffset;
384           rowOffset += nz+1;
385 
386           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
387           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
388 
          /* append the unit diagonal entry after this row's off-diagonals */
389           offset      += nz;
390           AjLo[offset] = (PetscInt) i;
391           AALo[offset] = (MatScalar) 1.0;
392           offset      += 1;
393 
394           v  += nz;
395           vi += nz;
396         }
397 
398         /* allocate space for the triangular factor information */
399         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
400         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
401         /* Create the matrix description */
402         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
403         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
404        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
405         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
406        #else
407         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
408        #endif
409         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
410         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
411 
412         /* set the operation */
413         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
414 
415         /* set the matrix */
416         loTriFactor->csrMat = new CsrMatrix;
417         loTriFactor->csrMat->num_rows = n;
418         loTriFactor->csrMat->num_cols = n;
419         loTriFactor->csrMat->num_entries = nzLower;
420 
      /* NOTE(review): the GPU index arrays are 32-bit (THRUSTINTARRAY32) while
         AiLo/AjLo are PetscInt; assumes the indices fit in int — confirm for
         64-bit-index builds */
421         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
422         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
423 
424         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
425         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
426 
427         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
428         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
429 
430         /* Create the solve analysis information */
431         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
432         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
433       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
434         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
435                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
436                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
437                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
438                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
439         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
440       #endif
441 
442         /* perform the solve analysis */
443         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
444                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
445                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
446                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
447                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
448                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
449                                #endif
450                                 );CHKERRCUSPARSE(stat);
451         cerr = WaitForCUDA();CHKERRCUDA(cerr);
452         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
453 
454         /* assign the pointer */
455         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
456 
457         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
458         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
459         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
460       } else { /* update values only */
461         /* Fill the lower triangular matrix */
462         AALo[0]  = 1.0;
463         v        = aa;
464         vi       = aj;
465         offset   = 1;
466         for (i=1; i<n; i++) {
467           nz = ai[i+1] - ai[i];
468           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
469           offset      += nz;
470           AALo[offset] = 1.0;
471           offset      += 1;
472           v  += nz;
473         }
474         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
475         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
476       }
477       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
478     } catch(char *ex) {
479       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
480     }
481   }
482   PetscFunctionReturn(0);
483 }
484 
/*
   Builds (first call) or value-refreshes (later calls) the GPU copy of the
   upper triangular factor U from the host ILU factorization stored in A.
   Rows are walked backwards (i = n-1 .. 0) using a->diag, the diagonal entry
   is stored as its reciprocal 1/v[nz] (CUSPARSE_DIAG_TYPE_NON_UNIT), the CSR
   arrays are staged in pinned host memory and uploaded through thrust arrays,
   and the cuSPARSE triangular-solve analysis is run once on first creation.
   Only acts when the fresh values live on the CPU (offloadmask UNALLOCATED or
   CPU).
   NOTE(review): relies on the Mat_SeqAIJ factored-matrix convention that
   a->diag[i] points at row i's diagonal and the strictly-upper entries of row
   i sit between adiag[i+1]+1 and adiag[i] — confirm against aij.h.
*/
485 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
486 {
487   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
488   PetscInt                          n = A->rmap->n;
489   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
491   cusparseStatus_t                  stat;
492   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
493   const MatScalar                   *aa = a->a,*v;
494   PetscInt                          *AiUp, *AjUp;
495   PetscScalar                       *AAUp;
496   PetscInt                          i,nz, nzUpper, offset;
497   PetscErrorCode                    ierr;
498   cudaError_t                       cerr;
499 
500   PetscFunctionBegin;
501   if (!n) PetscFunctionReturn(0);
502   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
503     try {
504       /* next, figure out the number of nonzeros in the upper triangular matrix. */
505       nzUpper = adiag[0]-adiag[n];
      /* pinned host buffer for the values; shared by the build and the
         update-only paths below */
506       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      /* first call: build the full CSR structure (pattern + values);
         subsequent calls only refresh the numerical values */
507       if (!upTriFactor) {
508         /* Allocate Space for the upper triangular matrix */
509         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
510         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
511 
512         /* Fill the upper triangular matrix */
513         AiUp[0]=(PetscInt) 0;
514         AiUp[n]=nzUpper;
515         offset = nzUpper;
516         for (i=n-1; i>=0; i--) {
517           v  = aa + adiag[i+1] + 1;
518           vi = aj + adiag[i+1] + 1;
519 
520           /* number of elements NOT on the diagonal */
521           nz = adiag[i] - adiag[i+1]-1;
522 
523           /* decrement the offset */
524           offset -= (nz+1);
525 
526           /* first, set the diagonal elements */
527           AjUp[offset] = (PetscInt) i;
          /* the factored matrix stores the inverted diagonal; keep it inverted */
528           AAUp[offset] = (MatScalar)1./v[nz];
529           AiUp[i]      = AiUp[i+1] - (nz+1);
530 
531           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
532           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
533         }
534 
535         /* allocate space for the triangular factor information */
536         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
537         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
538 
539         /* Create the matrix description */
540         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
541         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
542        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
543         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
544        #else
545         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
546        #endif
547         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
548         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
549 
550         /* set the operation */
551         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
552 
553         /* set the matrix */
554         upTriFactor->csrMat = new CsrMatrix;
555         upTriFactor->csrMat->num_rows = n;
556         upTriFactor->csrMat->num_cols = n;
557         upTriFactor->csrMat->num_entries = nzUpper;
558 
      /* NOTE(review): GPU index arrays are 32-bit (THRUSTINTARRAY32) while
         AiUp/AjUp are PetscInt; assumes indices fit in int — confirm for
         64-bit-index builds */
559         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
560         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
561 
562         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
563         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
564 
565         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
566         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
567 
568         /* Create the solve analysis information */
569         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
570         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
571       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
573                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
574                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
575                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
576                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
577         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
578       #endif
579 
580         /* perform the solve analysis */
581         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
582                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
583                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
584                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
585                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
586                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
587                                #endif
588                                 );CHKERRCUSPARSE(stat);
589         cerr = WaitForCUDA();CHKERRCUDA(cerr);
590         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
591 
592         /* assign the pointer */
593         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
594 
595         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
596         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
597         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
598       } else {
599         /* Fill the upper triangular matrix */
600         offset = nzUpper;
601         for (i=n-1; i>=0; i--) {
602           v  = aa + adiag[i+1] + 1;
603 
604           /* number of elements NOT on the diagonal */
605           nz = adiag[i] - adiag[i+1]-1;
606 
607           /* decrement the offset */
608           offset -= (nz+1);
609 
610           /* first, set the diagonal elements */
611           AAUp[offset] = 1./v[nz];
612           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
613         }
614         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
615         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
616       }
617       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
618     } catch(char *ex) {
619       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
620     }
621   }
622   PetscFunctionReturn(0);
623 }
624 
/*
   Pushes both ILU triangular factors of A to the GPU and caches the row/column
   permutations (as Thrust index arrays) when the orderings are not the identity.
   On success the matrix is marked as valid on both host and device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis = a->row,colis = a->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build the lower and upper triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the GPU, unless it is the identity */
  ierr = ISIdentity(rowis,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowis,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx, ridx+nrows);
    ierr = ISRestoreIndices(rowis,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the (inverse) column permutation */
  ierr = ISIdentity(colis,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colis,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx, cidx+nrows);
    ierr = ISRestoreIndices(colis,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
668 
/*
   Builds (or refreshes) the GPU copies of the two triangular factors used by the
   ICC solves. The incomplete Cholesky factor U is stored once in CSR form; the
   "lower" factor is the same CSR data with transposed solve operation and scaled
   values, so only one sparsity pattern is uploaded. On first call the structures
   (descriptors, CSR arrays, solve analysis) are created; on subsequent calls only
   the numerical values are re-assigned.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* host (pinned) CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;          /* host (pinned) values of U */
  PetscScalar                       *AALo;          /* host (pinned) values of the transposed/scaled factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): a and b both alias A->data — presumably the ICC factor matrix
     is stored as SeqSBAIJ here (only the upper triangle kept); confirm against
     the factorization routines that created A */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for async-friendly transfers */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build structure and values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* NOTE(review): v[nz] is the last stored entry of the row, presumably
             the diagonal — relies on the SBAIJ row layout; the factor diagonals
             are stored inverted for the solves */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated; the "lower" copy is additionally scaled
               by the inverse diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 needs a caller-provided work buffer; size it, then allocate */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER fill + TRANSPOSE op below: the lower factor is the upper
           pattern solved transposed, so no second sparsity pattern is needed */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structures already exist: recompute values only and re-upload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
884 
/*
   Pushes the ICC triangular factors of A to the GPU and, for a non-identity
   ordering, caches the permutation and its inverse as Thrust index arrays
   (row permutation applied on the way in, inverse on the way out of the solve).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = a->row;
  PetscBool                    permIsIdentity;
  PetscInt                     nrows = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* both triangles of the symmetric factor minus the doubly-counted diagonal */
  factors->nnz = (a->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    IS             permInv;
    const PetscInt *pidx,*ipidx;

    /* forward permutation -> row indices on the device */
    ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(pidx, pidx+nrows);
    ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);

    /* inverse permutation -> column indices on the device */
    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&permInv);CHKERRQ(ierr);
    ierr = ISGetIndices(permInv,&ipidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(ipidx, ipidx+nrows);
    ierr = ISRestoreIndices(permInv,&ipidx);CHKERRQ(ierr);
    ierr = ISDestroy(&permInv);CHKERRQ(ierr);

    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
922 
/*
   Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs on
   the CPU; afterwards the solve callbacks are selected (permutation-free kernels
   when both orderings are the identity) and the factors are copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIsIdentity,colIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric factorization happens on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(b->row,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(b->col,&colIsIdentity);CHKERRQ(ierr);
  if (rowIsIdentity && colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
952 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization runs on
   the CPU; afterwards the solve callbacks are selected (permutation-free kernels
   for the natural ordering) and the factors are copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      permIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric factorization happens on the host */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(b->row,&permIsIdentity);CHKERRQ(ierr);
  if (permIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
981 
/*
   Builds the explicit transposes (CSC form) of both triangular factors so that
   MatSolveTranspose can run with NON_TRANSPOSE cuSPARSE solves, and performs
   the triangular-solve analysis on each. The new structures are stored in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.

   Fix: the two MAT_CUSPARSEGenerateTranspose events were begun twice and never
   ended (PetscLogEventBegin where PetscLogEventEnd was required), which leaves
   the PETSc logging stage nesting unbalanced.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     same type/base/diag as the source, fill mode flipped by the transpose */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc needs an explicit work buffer; it is owned by the source factor */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin, leaving the event opened twice */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin, leaving the event opened twice */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1183 
/*
   Builds (once) an explicit transpose of A on the GPU and stores it in
   cusparsestruct->matTranspose, so that a later MatMultTranspose can be
   applied as an ordinary SpMV on the transposed data.  No-op when transpose
   generation is disabled (transgen false) or the transpose already exists.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do: transpose not requested, or it was already generated */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  /* NOTE(review): the initializer of matstructT above is dead -- the guard has
     already returned unless matTranspose is NULL, so this new does not leak */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  /* the transpose inherits the index base (0- or 1-based) of the forward matrix */
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  /* device-resident scalar constants (1 and 0): the handle is switched to
     CUSPARSE_POINTER_MODE_DEVICE below, so cusparse reads them from the GPU */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* A^T has swapped dimensions but the same number of nonzeros */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* full (uncompressed) row offsets of A, copied from the host CSR arrays.
       NOTE(review): assigned without deleting any previous allocation --
       presumably rowoffsets_gpu is NULL on this path; verify against callers */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 csr2csc requires a caller-provided workspace; query its size first */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    /* CSC of A is exactly the CSR of A^T; note the pre-/post-CUDA-11 APIs take
       the output index arrays in a different order */
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptor for the transpose, used by the CUDA-11 SpMV path */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    /* HYB has no direct transpose; round-trip HYB -> CSR -> CSC -> HYB using
       two temporary CSR matrices that are freed at the end of this branch */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    /* ELL forces the max (pure-ELL) partition; HYB lets cusparse choose */
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1342 
1343 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b using the (I)LU factorization held on the GPU, with row and
   column permutations applied.  Since A = LU implies A^T = U^T L^T, the solve
   order is: apply row permutation to b, solve with the transposed upper
   factor, then the transposed lower factor, then apply the column permutation.
   The transposed factor data is generated lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers populated by the analysis above */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  /* gathers b[rperm[i]] into x, which then serves as the right-hand side */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  /* x -> tempGPU: triangular solve with the transposed upper factor (U^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  /* tempGPU -> x: triangular solve with the transposed lower factor (L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz(factors) - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1429 
/*
   Same as MatSolveTranspose_SeqAIJCUSPARSE but for a factorization done in the
   natural ordering: no row/column permutations are applied.  Solves
   A^T x = b as U^T z = b followed by L^T x = z, using lazily generated
   transposed factor data.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers populated by the analysis above */
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  /* b -> tempGPU: triangular solve with the transposed upper factor (U^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  /* tempGPU -> x: triangular solve with the transposed lower factor (L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz(factors) - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1497 
/*
   Solves A x = b using the (I)LU factorization held on the GPU, with row and
   column permutations applied: gather b through the row permutation, solve
   L then U, then scatter through the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  /* gathers b[rperm[i]] into the work vector; the copy length is governed by
     the rpermIndices index range, so the element base need not be advanced */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  /* tempGPU -> x: lower triangular solve */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  /* x -> tempGPU: upper triangular solve */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz(factors) - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1572 
/*
   Same as MatSolve_SeqAIJCUSPARSE but for a factorization done in the natural
   ordering: no row/column permutations.  Solves L z = b then U x = z.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  /* b -> tempGPU: lower triangular solve */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U */
  /* tempGPU -> x: upper triangular solve */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz(factors) - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1632 
/*
   Mirrors the matrix values from the GPU back into the host CSR array when the
   GPU copy is the only up-to-date one; afterwards both copies are valid.
   A no-op for any other offload state.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* only the numerical values need copying; the sparsity pattern on the host is unchanged */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1653 
/*
   Returns a writable pointer to the host CSR value array, first making sure
   the host copy is current.  Because the caller may modify the returned
   values, the offload mask is downgraded so the device copy is treated as
   stale from here on.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* sync device -> host before handing out the raw array */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1665 
1666 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1667 {
1668   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1669   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1670   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1671   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1672   PetscErrorCode               ierr;
1673   cusparseStatus_t             stat;
1674   cudaError_t                  err;
1675 
1676   PetscFunctionBegin;
1677   if (A->boundtocpu) PetscFunctionReturn(0);
1678   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1679     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1680       /* Copy values only */
1681       CsrMatrix *matrix,*matrixT;
1682       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1683 
1684       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1685       matrix->values->assign(a->a, a->a+a->nz);
1686       err  = WaitForCUDA();CHKERRCUDA(err);
1687       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1688       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1689 
1690       /* Update matT when it was built before */
1691       if (cusparsestruct->matTranspose) {
1692         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1693         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1694         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1695         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1696                             A->cmap->n, matrix->num_entries,
1697                             matrix->values->data().get(),
1698                             cusparsestruct->rowoffsets_gpu->data().get(),
1699                             matrix->column_indices->data().get(),
1700                             matrixT->values->data().get(),
1701                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1702                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1703                             CUSPARSE_ACTION_NUMERIC,indexBase,
1704                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1705                           #else
1706                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1707                             CUSPARSE_ACTION_NUMERIC, indexBase
1708                           #endif
1709                            );CHKERRCUSPARSE(stat);
1710         err  = WaitForCUDA();CHKERRCUDA(err);
1711         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1712       }
1713     } else {
1714       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1715       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1716       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1717       delete cusparsestruct->workVector;
1718       delete cusparsestruct->rowoffsets_gpu;
1719       try {
1720         if (a->compressedrow.use) {
1721           m    = a->compressedrow.nrows;
1722           ii   = a->compressedrow.i;
1723           ridx = a->compressedrow.rindex;
1724         } else {
1725           m    = A->rmap->n;
1726           ii   = a->i;
1727           ridx = NULL;
1728         }
1729         cusparsestruct->nrows = m;
1730 
1731         /* create cusparse matrix */
1732         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1733         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1734         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1735         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1736 
1737         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1738         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1739         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1740         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1741         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1742         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1743         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1744 
1745         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1746         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1747           /* set the matrix */
1748           CsrMatrix *mat= new CsrMatrix;
1749           mat->num_rows = m;
1750           mat->num_cols = A->cmap->n;
1751           mat->num_entries = a->nz;
1752           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1753           mat->row_offsets->assign(ii, ii + m+1);
1754 
1755           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1756           mat->column_indices->assign(a->j, a->j+a->nz);
1757 
1758           mat->values = new THRUSTARRAY(a->nz);
1759           mat->values->assign(a->a, a->a+a->nz);
1760 
1761           /* assign the pointer */
1762           matstruct->mat = mat;
1763          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1764           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1765             stat = cusparseCreateCsr(&matstruct->matDescr,
1766                                     mat->num_rows, mat->num_cols, mat->num_entries,
1767                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1768                                     mat->values->data().get(),
1769                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1770                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1771           }
1772          #endif
1773         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1774          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1775           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1776          #else
1777           CsrMatrix *mat= new CsrMatrix;
1778           mat->num_rows = m;
1779           mat->num_cols = A->cmap->n;
1780           mat->num_entries = a->nz;
1781           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1782           mat->row_offsets->assign(ii, ii + m+1);
1783 
1784           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1785           mat->column_indices->assign(a->j, a->j+a->nz);
1786 
1787           mat->values = new THRUSTARRAY(a->nz);
1788           mat->values->assign(a->a, a->a+a->nz);
1789 
1790           cusparseHybMat_t hybMat;
1791           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1792           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1793             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1794           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1795               matstruct->descr, mat->values->data().get(),
1796               mat->row_offsets->data().get(),
1797               mat->column_indices->data().get(),
1798               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1799           /* assign the pointer */
1800           matstruct->mat = hybMat;
1801 
1802           if (mat) {
1803             if (mat->values) delete (THRUSTARRAY*)mat->values;
1804             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1805             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1806             delete (CsrMatrix*)mat;
1807           }
1808          #endif
1809         }
1810 
1811         /* assign the compressed row indices */
1812         if (a->compressedrow.use) {
1813           cusparsestruct->workVector = new THRUSTARRAY(m);
1814           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1815           matstruct->cprowIndices->assign(ridx,ridx+m);
1816           tmp = m;
1817         } else {
1818           cusparsestruct->workVector = NULL;
1819           matstruct->cprowIndices    = NULL;
1820           tmp = 0;
1821         }
1822         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1823 
1824         /* assign the pointer */
1825         cusparsestruct->mat = matstruct;
1826       } catch(char *ex) {
1827         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1828       }
1829       err  = WaitForCUDA();CHKERRCUDA(err);
1830       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1831       cusparsestruct->nonzerostate = A->nonzerostate;
1832     }
1833     A->offloadmask = PETSC_OFFLOAD_BOTH;
1834   }
1835   PetscFunctionReturn(0);
1836 }
1837 
/* Functor for thrust::for_each over a zip iterator: accumulates the first
   tuple element into the second, i.e. t<1> += t<0>. Used below to
   scatter-add the compressed-row work vector into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t) const
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1847 
/* Functor for thrust::for_each over a zip iterator: copies the first tuple
   element into the second, i.e. t<1> = t<0>. */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t) const
  {
    using thrust::get;
    get<1>(t) = get<0>(t);
  }
};
1857 
/* Functor for thrust::for_each over a zip iterator: copies the second tuple
   element into the first, i.e. t<0> = t<1> (the mirror of VecCUDAEquals).
   Used below to gather x into the compressed-row work vector. */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t) const
  {
    using thrust::get;
    get<0>(t) = get<1>(t);
  }
};
1867 
/* Product data stashed in C->product->data for AIJCUSPARSE * DENSECUDA matrix products */
struct MatMatCusparse {
  PetscBool            cisdense;      /* C was MATSEQDENSE (CPU): convert the GPU result back after the numeric phase */
  PetscScalar          *Bt;           /* device buffer holding an explicit B^T (CUDA < 11 path only, where csrmm cannot transpose B) */
  Mat                  X;             /* intermediate dense result of the sparse product, used for PtAP/RARt */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;   /* descriptors/buffer below are set up for C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;     /* dense descriptor wrapping B's device array */
  cusparseDnMatDescr_t matCDescr;     /* dense descriptor wrapping C's (or X's) device array */
  size_t               spmmBufferSize;
  void                 *spmmBuffer;   /* workspace required by cusparseSpMM */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1881 
/* Destroy callback for the MatMatCusparse data attached to C->product:
   releases device buffers, cuSPARSE dense descriptors, the intermediate
   matrix X, and finally the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mm->matBDescr)  {stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);}
  if (mm->matCDescr)  {stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);}
  if (mm->spmmBuffer) {cerr = cudaFree(mm->spmmBuffer);CHKERRCUDA(cerr);}
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1900 
1901 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1902 
/*
   Numeric phase for C = op(A) op(B) with A of type SEQAIJCUSPARSE and B dense.
   For PtAP/RARt the sparse-dense product is computed into the intermediate
   dense matrix mmdata->X (set up in the symbolic phase) and then finished
   with a dense-dense multiply against B. cuSPARSE descriptors and the SpMM
   workspace cached in mmdata are reused across repeated numeric calls.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Select op(A) and the dimensions (m x n) of the sparse-dense product:
     for PtAP/RARt this is the intermediate product stored in mmdata->X */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cuSPARSE transpose A on the fly, or use the explicitly generated transpose */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* the sparse product targets the intermediate X for PtAP/RARt, C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;  /* intentionally shadows the outer cudaError_t cerr in this scope */

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* explicitly form B^T in mmdata->Bt (allocated in the symbolic phase) via an out-of-place geam */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* PtAP/RARt: finish C by multiplying B with the intermediate X; the boolean
     arguments select transposition (NOTE(review): PETSC_TRUE presumably
     transposes the first dense factor — confirm against the Private routine) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* C was originally a CPU dense matrix: move the result back */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* restore B to its original CPU type (it was converted in place above) */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2072 
/*
   Symbolic phase for C = op(A) op(B) with A SEQAIJCUSPARSE and B dense: sets
   the sizes and type of C, allocates the MatMatCusparse product data (plus
   the buffers/intermediate matrix some product types need), and installs
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the final product C */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2146 
2147 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2148 
2149 /* handles dense B */
2150 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2151 {
2152   Mat_Product    *product = C->product;
2153   PetscErrorCode ierr;
2154 
2155   PetscFunctionBegin;
2156   MatCheckProduct(C,1);
2157   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2158   if (product->A->boundtocpu) {
2159     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2160     PetscFunctionReturn(0);
2161   }
2162   switch (product->type) {
2163   case MATPRODUCT_AB:
2164   case MATPRODUCT_AtB:
2165   case MATPRODUCT_ABt:
2166   case MATPRODUCT_PtAP:
2167   case MATPRODUCT_RARt:
2168     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2169   default:
2170     break;
2171   }
2172   PetscFunctionReturn(0);
2173 }
2174 
/* yy = A xx (plain mat-vec: trans=herm=PETSC_FALSE, no vector to add) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2183 
/* zz = A xx + yy (trans=herm=PETSC_FALSE) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2192 
/* yy = A^H xx (trans=herm=PETSC_TRUE, no vector to add) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2201 
/* zz = A^H xx + yy (trans=herm=PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2210 
/* yy = A^T xx (trans=PETSC_TRUE, herm=PETSC_FALSE, no vector to add) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2219 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared kernel behind all MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose
   wrappers above. yy may be NULL (no vector to add) and yy may alias zz (in-place add);
   both cases are handled explicitly below. Compressed-row storage (zero rows dropped)
   requires an extra gather/scatter through cusparsestruct->workVector. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* only set/used in the MAT_CUSPARSE_CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* all rows empty: op(A) x == 0, so the result is 0 or a copy of yy */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose path: either ask cuSPARSE to transpose/conjugate-transpose on the fly,
       or use the explicitly generated transpose structure when available */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV cache; guard against enum values outside [0,2] */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: A^T x is full length and was computed with beta, so only add yy when it is a distinct vector */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2403 
/* Computes zz = A^T*xx + yy on the GPU by dispatching to the shared mult-add kernel
   (transpose = PETSC_TRUE, Hermitian = PETSC_FALSE). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2412 
/* Finish assembly for an AIJCUSPARSE matrix. If a device-side split-CSR structure exists
   (matrix was set up for assembly on the GPU), its nonzerostate decides whether the values
   were produced on the host (copy them up) or on the device (mark the GPU copy current). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
  PetscBool                   is_seq = PETSC_TRUE;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    /* pull the device-side metadata back to the host to learn where assembly happened */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) { // assembled on the device: GPU copy is the only current one
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2440 
2441 /* --------------------------------------------------------------------------------*/
2442 /*@
2443    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2445    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2446    assembly performance the user should preallocate the matrix storage by setting
2447    the parameter nz (or the array nnz).  By setting these parameters accurately,
2448    performance during matrix assembly can be increased by more than a factor of 50.
2449 
2450    Collective
2451 
2452    Input Parameters:
2453 +  comm - MPI communicator, set to PETSC_COMM_SELF
2454 .  m - number of rows
2455 .  n - number of columns
2456 .  nz - number of nonzeros per row (same for all rows)
2457 -  nnz - array containing the number of nonzeros in the various rows
2458          (possibly different for each row) or NULL
2459 
2460    Output Parameter:
2461 .  A - the matrix
2462 
2463    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2465    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2466 
2467    Notes:
2468    If nnz is given then nz is ignored
2469 
2470    The AIJ format (also called the Yale sparse matrix format or
2471    compressed row storage), is fully compatible with standard Fortran 77
2472    storage.  That is, the stored row and column indices can begin at
2473    either one (as in Fortran) or zero.  See the users' manual for details.
2474 
2475    Specify the preallocated storage with either nz or nnz (not both).
2476    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2477    allocation.  For large problems you MUST preallocate memory or you
2478    will get TERRIBLE performance, see the users' manual chapter on matrices.
2479 
2480    By default, this format uses inodes (identical nodes) when possible, to
2481    improve numerical efficiency of matrix-vector products and solves. We
2482    search for consecutive rows with the same nonzero structure, thereby
2483    reusing matrix information to achieve increased efficiency.
2484 
2485    Level: intermediate
2486 
2487 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2488 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate via the SeqAIJ implementation; a non-NULL nnz[] overrides nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2500 
/* Destroy an AIJCUSPARSE matrix: free the cuSPARSE (or triangular-factor) structures,
   free the device-assembly split-CSR matrix if one was created, detach the composed
   methods, and finally fall through to the base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device-matrix pointer before destroying spptr; it is freed separately below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct to the host to recover the device pointers it owns */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        /* diag.i was device-allocated only in the compressed-row case */
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  /* remove composed methods so no stale function pointers survive the type teardown */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2536 
2537 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2538 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying on the host (SeqAIJ) and converting the copy in place back to AIJCUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2548 
/* Toggle where this matrix computes: flg = PETSC_TRUE pins it to the CPU (install the
   plain SeqAIJ kernels and detach GPU-only composed methods), flg = PETSC_FALSE installs
   the CUSPARSE kernels. Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before switching to CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inodes are only exploited by the CPU kernels */
  PetscFunctionReturn(0);
}
2587 
/* Zero all stored values. Zeros the GPU CSR copies (and the device-assembly matrix, if
   present) directly, then zeros the host copy; if both copies were zeroed the offload
   mask is set to PETSC_OFFLOAD_BOTH so no transfer is triggered later. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE; /* set when the GPU values were zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached transpose consistent */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
    PetscInt     n = A->rmap->n, nnz = a->i[n];
    cudaError_t  err;
    PetscScalar  *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* fetch only the device values pointer, then memset it in place */
    err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}
2626 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE. For MAT_INITIAL_MATRIX/MAT_REUSE_MATRIX the
   data is first duplicated/copied; for MAT_INPLACE_MATRIX A itself is retargeted. Then the
   cuSPARSE side structures are allocated (if absent), the CUSPARSE method table installed,
   and the type name changed. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX falls through: *newmat is A, converted in place */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: full cuSPARSE mat/vec structure */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: triangular-factor structure instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* install the GPU kernels and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2674 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, convert it
   in place, then process command-line options for the new type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* lazily initialize the CUDA device */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2688 
2689 /*MC
2690    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2691 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2693    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2694    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2695 
2696    Options Database Keys:
2697 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2698 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2699 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2700 
2701   Level: beginner
2702 
2703 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2704 M*/
2705 
2706 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2707 
2708 
/* Register the cuSPARSE solver package for MATSEQAIJCUSPARSE: the same factory routine
   serves LU, Cholesky, ILU, and ICC factorizations. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftype[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              k;

  PetscFunctionBegin;
  for (k=0; k<sizeof(ftype)/sizeof(ftype[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftype[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2720 
/* Free a Mat_SeqAIJCUSPARSE structure and everything it owns: the mat and matTranspose
   mult structures, the thrust work/COO arrays, the cuSPARSE handle, and (CUDA >= 11)
   the csr2csc conversion buffer. *cusparsestruct may be NULL; it is freed and zeroed. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is a no-op, so no guard is needed here */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2744 
/* Free the three thrust device arrays backing a CSR matrix and the struct itself.
   A NULL *mat is a no-op; *mat is reset to NULL on return. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
2757 
/* Free one triangular-factor structure: matrix descriptor, analysis info, the CSR copy
   of the factor, and its solve/conversion scratch buffers. *trifactor may be NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2776 
/* Free a Mat_SeqAIJCUSPARSEMultStruct and all GPU resources it owns: the stored matrix
   (CSR, or ELL/HYB before CUDA-11), the matrix descriptor, the compressed-row index
   array, the device-resident scalar constants, and (CUDA >= 11) the generic-SpMV
   descriptors and buffers. *matstruct may be NULL; it is set to NULL on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fix: return code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic SpMV API resources: sparse-matrix descriptor plus one dense-vector
       descriptor pair and buffer per operation (N, T, H) that was actually used */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
2820 
/* Release all four triangular factors plus the permutation index and work arrays,
   leaving the container itself (and its cuSPARSE handle) alive for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
2840 
/* Destroy a Mat_SeqAIJCUSPARSETriFactors container: reset (free) the factors it holds,
   destroy its cuSPARSE handle if one was created, and free the struct itself.
   *trifactors may be NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle; /* was an assignment inside the if-condition; hoisted for clarity and to silence -Wparentheses */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2857 
/* Strict-weak ordering of (i,j) COO index pairs: lexicographic by row, then column. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
2868 
/* Equality of (i,j) COO index pairs: both components must match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
2878 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
2887 
/* Logical-or of two flags, returned as a PetscInt (0 or 1). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
2896 
2897 #include <thrust/iterator/discard_iterator.h>
/* Scatter user-provided COO values v[] (in the ordering given to MatSetPreallocationCOO)
   into the GPU CSR values array, using the permutation (cooPerm) and, when duplicate
   (i,j) entries exist, the compaction map (cooPerm_a) built during preallocation.
   imode selects ADD_VALUES (accumulate) or insertion; v == NULL behaves like all zeros. */
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: nothing to scatter, just (re)assemble */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    /* upload the user values (still in the user's COO ordering) */
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      /* duplicates present: sum each group of duplicates into cooPerm_w, then add to CSR values */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: add the permuted values straight into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2962 
2963 #include <thrust/binary_search.h>
/* Build the CSR sparsity pattern for n COO entries (coo_i, coo_j) on the GPU.
   Sorts the (i,j) pairs lexicographically while recording the permutation cooPerm
   from user order to sorted order; if duplicate pairs exist, also builds cooPerm_a,
   which maps each sorted entry to its compacted (unique) CSR slot. The resulting
   row pointers and column indices are copied back to the host SeqAIJ structure,
   and the cuSPARSE mult structure is (re)created with zeroed values. */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts nonempty rows, for compressed-row detection */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed: drop all cached COO state so it is rebuilt below */
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* sort (i,j) pairs, dragging along 0..n-1 so cooPerm records user->sorted order */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep sorted rows/cols; unique() below compacts d_i/d_j in place */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark positions where either row or column changes, then prefix-sum: each sorted
         entry gets the index of its unique (i,j) slot in the compacted CSR array */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row via binary search over the sorted, deduplicated row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* rebuild the host SeqAIJ arrays from the device result */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy garbage to the GPU */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    /* the host values copied up were garbage; zero the device values and drop any stale transpose */
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3068